Snap for 8892598 from 6939824c0cf8321a1718973892371085f7b4edff to mainline-os-statsd-release

Change-Id: Icb4203185703c5ee4af84fbcac16af006c3eb626
diff --git a/Android.bp b/Android.bp
index ecd4bb8..8708fa1 100644
--- a/Android.bp
+++ b/Android.bp
@@ -108,6 +108,7 @@
     "libvpx/vp9/decoder/vp9_detokenize.c",
     "libvpx/vp9/decoder/vp9_dsubexp.c",
     "libvpx/vp9/decoder/vp9_job_queue.c",
+    "libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c",
     "libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c",
     "libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c",
     "libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c",
@@ -347,6 +348,7 @@
     "libvpx/vp9/decoder/vp9_detokenize.c",
     "libvpx/vp9/decoder/vp9_dsubexp.c",
     "libvpx/vp9/decoder/vp9_job_queue.c",
+    "libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c",
     "libvpx/vp9/encoder/arm/neon/vp9_frame_scale_neon.c",
     "libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c",
     "libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c",
@@ -837,8 +839,6 @@
     "libvpx/vpx_dsp/x86/inv_wht_sse2.asm",
     "libvpx/vpx_dsp/x86/sad4d_sse2.asm",
     "libvpx/vpx_dsp/x86/sad_sse2.asm",
-    "libvpx/vpx_dsp/x86/sad_sse3.asm",
-    "libvpx/vpx_dsp/x86/sad_ssse3.asm",
     "libvpx/vpx_dsp/x86/subpel_variance_sse2.asm",
     "libvpx/vpx_dsp/x86/subtract_sse2.asm",
     "libvpx/vpx_dsp/x86/vpx_convolve_copy_sse2.asm",
@@ -1077,8 +1077,6 @@
     "libvpx/vpx_dsp/x86/inv_wht_sse2.asm",
     "libvpx/vpx_dsp/x86/sad4d_sse2.asm",
     "libvpx/vpx_dsp/x86/sad_sse2.asm",
-    "libvpx/vpx_dsp/x86/sad_sse3.asm",
-    "libvpx/vpx_dsp/x86/sad_ssse3.asm",
     "libvpx/vpx_dsp/x86/ssim_opt_x86_64.asm",
     "libvpx/vpx_dsp/x86/subpel_variance_sse2.asm",
     "libvpx/vpx_dsp/x86/subtract_sse2.asm",
diff --git a/README.android b/README.android
index 30e0c2b..38780ac 100644
--- a/README.android
+++ b/README.android
@@ -1,12 +1,12 @@
 Name: libvpx
 URL: http://www.webmproject.org
-Version: v1.11.0
+Version: v1.12.0
 License: BSD
 License File: libvpx/LICENSE
 
-Date: Thursday October 7 2021
-Branch: origin/smew
-Commit: 626ff35955c2c35b806b3e0ecf551a1a8611cdbf
+Date: Thursday June 30 2022
+Branch: origin/torrent
+Commit: 03265cd42b3783532de72f2ded5436652e6f5ce3
 
 Description:
 Contains the sources used to compile libvpx.
diff --git a/README.version b/README.version
index 4ce368f..7dfba96 100644
--- a/README.version
+++ b/README.version
@@ -1,8 +1,6 @@
-URL: https://chromium.googlesource.com/webm/libvpx/+archive/v1.11.0.tar.gz
-Version: v1.11.0
+URL: https://chromium.googlesource.com/webm/libvpx/+archive/v1.12.0.tar.gz
+Version: v1.12.0
 BugComponent: 42195
 Owners: jzern, jianj
 Local Modifications:
-  218b99892 vp8 encoder: fix some integer overflows
-  05f80a920 vp8,calc_pframe_target_size: fix integer overflow
-  7afb3a676 vp8_update_rate_correction_factors: fix integer overflow
+  None
diff --git a/config/arm-neon/vp8_rtcd.h b/config/arm-neon/vp8_rtcd.h
index 328c67e..d204ef7 100644
--- a/config/arm-neon/vp8_rtcd.h
+++ b/config/arm-neon/vp8_rtcd.h
@@ -96,9 +96,6 @@
 void vp8_fast_quantize_b_neon(struct block *, struct blockd *);
 #define vp8_fast_quantize_b vp8_fast_quantize_b_neon
 
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_full_search_sad vp8_full_search_sad_c
-
 void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_bh vp8_loop_filter_bh_neon
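
These rtcd ("run-time CPU detect") headers bind each generic DSP symbol to the best implementation for the configured target with a plain #define, as the surviving vp8_loop_filter_bh lines above show; the removed vp8_full_search_sad entry just drops a prototype that the v1.12.0 generator no longer emits. A minimal sketch of the dispatch pattern, assuming hypothetical my_filter_c/my_filter_neon stand-ins for the real function pairs:

    /* Sketch of the rtcd #define dispatch used throughout these headers.
     * my_filter_c / my_filter_neon and main() are hypothetical; only the
     * aliasing mirrors "#define vp8_loop_filter_bh vp8_loop_filter_bh_neon". */
    #include <stdio.h>

    void my_filter_c(int *buf, int n) {      /* portable fallback */
      for (int i = 0; i < n; ++i) buf[i] += 1;
    }

    void my_filter_neon(int *buf, int n) {   /* stand-in for the NEON body */
      for (int i = 0; i < n; ++i) buf[i] += 1;
    }

    /* On an arm-neon config the header resolves the generic name at
     * compile time; no function-pointer indirection is involved. */
    #define my_filter my_filter_neon

    int main(void) {
      int buf[4] = {0, 1, 2, 3};
      my_filter(buf, 4);                     /* calls my_filter_neon */
      printf("%d %d %d %d\n", buf[0], buf[1], buf[2], buf[3]);
      return 0;
    }
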
diff --git a/config/arm-neon/vp9_rtcd.h b/config/arm-neon/vp9_rtcd.h
index 859eadd..01065e6 100644
--- a/config/arm-neon/vp9_rtcd.h
+++ b/config/arm-neon/vp9_rtcd.h
@@ -41,13 +41,16 @@
 #define vp9_diamond_search_sad vp9_diamond_search_sad_c
 
 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_fht16x16 vp9_fht16x16_c
+void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht16x16 vp9_fht16x16_neon
 
 void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_fht4x4 vp9_fht4x4_c
+void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht4x4 vp9_fht4x4_neon
 
 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_fht8x8 vp9_fht8x8_c
+void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht8x8 vp9_fht8x8_neon
 
 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 #define vp9_fwht4x4 vp9_fwht4x4_c
@@ -79,10 +82,10 @@
 void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
 #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_neon
 
-void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
 
-void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
 
 void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count);
@@ -100,12 +103,12 @@
 void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 #define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon
 
-void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_fp vp9_quantize_fp_neon
 
-void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_neon
 
 void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
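
The v1.12.0 prototypes above drop the unused skip_block argument from the fixed-point quantizers and route the forward hybrid transforms (vp9_fht4x4/8x8/16x16) to new NEON implementations. A hedged, self-contained sketch of a call against the new quantize signature — the stub body, the test data, and the local tran_low_t typedef are assumptions for illustration, not libvpx code:

    #include <stdint.h>
    #include <stdio.h>

    typedef int32_t tran_low_t;  /* local stand-in for the vpx typedef */

    /* Stub with the v1.12.0 vp9_quantize_fp parameter list (skip_block
     * removed). The body is a placeholder, not the real quantizer. */
    static void vp9_quantize_fp_stub(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                                     const int16_t *round_ptr, const int16_t *quant_ptr,
                                     tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                     const int16_t *dequant_ptr, uint16_t *eob_ptr,
                                     const int16_t *scan, const int16_t *iscan) {
      (void)round_ptr; (void)quant_ptr; (void)dequant_ptr; (void)scan; (void)iscan;
      for (intptr_t i = 0; i < n_coeffs; ++i) qcoeff_ptr[i] = dqcoeff_ptr[i] = coeff_ptr[i];
      *eob_ptr = (uint16_t)n_coeffs;
    }

    int main(void) {
      tran_low_t coeff[4] = {10, -3, 0, 7}, q[4], dq[4];
      int16_t round[2] = {0}, quant[2] = {0}, dequant[2] = {0};
      int16_t scan[4] = {0}, iscan[4] = {0};
      uint16_t eob = 0;
      /* Note: no skip_block argument between n_coeffs and round_ptr any more. */
      vp9_quantize_fp_stub(coeff, 4, round, quant, q, dq, dequant, &eob, scan, iscan);
      printf("eob = %u\n", eob);
      return 0;
    }
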
diff --git a/config/arm-neon/vpx_config.asm b/config/arm-neon/vpx_config.asm
index 648aa2b..2ccf56f 100644
--- a/config/arm-neon/vpx_config.asm
+++ b/config/arm-neon/vpx_config.asm
@@ -1,11 +1,12 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.syntax unified
+.syntax unified
 .equ VPX_ARCH_ARM ,  1
 .equ VPX_ARCH_MIPS ,  0
 .equ VPX_ARCH_X86 ,  0
 .equ VPX_ARCH_X86_64 ,  0
 .equ VPX_ARCH_PPC ,  0
+.equ VPX_ARCH_LOONGARCH ,  0
 .equ HAVE_NEON ,  1
 .equ HAVE_NEON_ASM ,  1
 .equ HAVE_MIPS32 ,  0
@@ -23,6 +24,8 @@
 .equ HAVE_AVX512 ,  0
 .equ HAVE_VSX ,  0
 .equ HAVE_MMI ,  0
+.equ HAVE_LSX ,  0
+.equ HAVE_LASX ,  0
 .equ HAVE_VPX_PORTS ,  1
 .equ HAVE_PTHREAD_H ,  1
 .equ HAVE_UNISTD_H ,  1
@@ -88,4 +91,4 @@
 .equ CONFIG_EMULATE_HARDWARE ,  0
 .equ CONFIG_NON_GREEDY_MV ,  0
 .equ CONFIG_RATE_CTRL ,  0
-	.section	.note.GNU-stack,"",%progbits
+    .section .note.GNU-stack,"",%progbits
diff --git a/config/arm-neon/vpx_config.h b/config/arm-neon/vpx_config.h
index ddc9557..3fa6606 100644
--- a/config/arm-neon/vpx_config.h
+++ b/config/arm-neon/vpx_config.h
@@ -15,6 +15,7 @@
 #define VPX_ARCH_X86 0
 #define VPX_ARCH_X86_64 0
 #define VPX_ARCH_PPC 0
+#define VPX_ARCH_LOONGARCH 0
 #define HAVE_NEON 1
 #define HAVE_NEON_ASM 1
 #define HAVE_MIPS32 0
@@ -32,6 +33,8 @@
 #define HAVE_AVX512 0
 #define HAVE_VSX 0
 #define HAVE_MMI 0
+#define HAVE_LSX 0
+#define HAVE_LASX 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 1
 #define HAVE_UNISTD_H 1
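
The regenerated config adds the LoongArch architecture and SIMD feature flags (VPX_ARCH_LOONGARCH, HAVE_LSX, HAVE_LASX), all zero for this arm-neon target. A small sketch, assuming the usual preprocessor-gating pattern, of how such generated macros select code paths:

    /* Sketch of how vpx_config.h macros are consumed. The branch bodies are
     * hypothetical placeholders; the #if pattern mirrors how libvpx gates
     * SIMD code on HAVE_NEON / HAVE_LSX / HAVE_LASX. */
    #include <stdio.h>

    #define HAVE_NEON 1   /* values copied from config/arm-neon/vpx_config.h */
    #define HAVE_LSX 0
    #define HAVE_LASX 0

    int main(void) {
    #if HAVE_NEON
      puts("building the NEON code paths");
    #elif HAVE_LSX || HAVE_LASX
      puts("building the LoongArch SIMD code paths");
    #else
      puts("building the portable C code paths");
    #endif
      return 0;
    }
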
diff --git a/config/arm-neon/vpx_dsp/arm/idct4x4_1_add_neon.asm.S b/config/arm-neon/vpx_dsp/arm/idct4x4_1_add_neon.asm.S
index e27f38a..9e44ccd 100644
--- a/config/arm-neon/vpx_dsp/arm/idct4x4_1_add_neon.asm.S
+++ b/config/arm-neon/vpx_dsp/arm/idct4x4_1_add_neon.asm.S
@@ -1,6 +1,6 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.syntax unified
+.syntax unified
 @
 @  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 @
@@ -11,14 +11,14 @@
 @
 
 
-    .global vpx_idct4x4_1_add_neon 
-	.type vpx_idct4x4_1_add_neon, function
-   .arm
-   .eabi_attribute 24, 1 @Tag_ABI_align_needed
-   .eabi_attribute 25, 1 @Tag_ABI_align_preserved
+    .global vpx_idct4x4_1_add_neon
+    .type vpx_idct4x4_1_add_neon, function
+    .arm
+    .eabi_attribute 24, 1 @Tag_ABI_align_needed
+    .eabi_attribute 25, 1 @Tag_ABI_align_preserved
 
-.text
-.p2align 2
+    .text
+    .p2align 2
 
 @void vpx_idct4x4_1_add_neon(int16_t *input, uint8_t *dest, int stride)
 @
@@ -26,8 +26,7 @@
 @ r1  uint8_t *dest
 @ r2  int stride)
 
-_vpx_idct4x4_1_add_neon:
-	vpx_idct4x4_1_add_neon: @ PROC
+vpx_idct4x4_1_add_neon: @ PROC
     ldrsh            r0, [r0]
 
     @ cospi_16_64 = 11585
@@ -67,6 +66,6 @@
     vst1.32          {d7[1]}, [r12]
 
     bx               lr
-	.size vpx_idct4x4_1_add_neon, .-vpx_idct4x4_1_add_neon    @ ENDP             @ |vpx_idct4x4_1_add_neon|
+.size vpx_idct4x4_1_add_neon, .-vpx_idct4x4_1_add_neon    @ ENDP             @ |vpx_idct4x4_1_add_neon|
 
-	.section	.note.GNU-stack,"",%progbits
+    .section .note.GNU-stack,"",%progbits
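
For reference, the routine above is the DC-only 4x4 inverse transform: it loads the single DC coefficient (ldrsh), scales it twice by cospi_16_64 = 11585 with 14-bit rounding, rounds by 4 more bits, and adds the result to all 16 pixels. A hedged C restatement under those assumptions (the helper names are local, and the WRAPLOW saturation of the real code is omitted):

    #include <stdint.h>
    #include <stdio.h>

    static int16_t round_shift14(int32_t x) { return (int16_t)((x + (1 << 13)) >> 14); }
    static uint8_t clip_u8(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

    static void idct4x4_dc_add(const int16_t *input, uint8_t *dest, int stride) {
      const int cospi_16_64 = 11585;
      int16_t out = round_shift14(input[0] * cospi_16_64);
      int a1;
      out = round_shift14(out * cospi_16_64);
      a1 = (out + 8) >> 4;                   /* round by 4 more bits */
      for (int r = 0; r < 4; ++r)
        for (int c = 0; c < 4; ++c)
          dest[r * stride + c] = clip_u8(dest[r * stride + c] + a1);
    }

    int main(void) {
      int16_t in[16] = {64};                 /* DC-only block */
      uint8_t dst[16] = {0};
      idct4x4_dc_add(in, dst, 4);
      printf("dc add -> %d\n", dst[0]);
      return 0;
    }
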
diff --git a/config/arm-neon/vpx_dsp/arm/idct4x4_add_neon.asm.S b/config/arm-neon/vpx_dsp/arm/idct4x4_add_neon.asm.S
index b35667c..3a21bb4 100644
--- a/config/arm-neon/vpx_dsp/arm/idct4x4_add_neon.asm.S
+++ b/config/arm-neon/vpx_dsp/arm/idct4x4_add_neon.asm.S
@@ -1,6 +1,6 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.syntax unified
+.syntax unified
 @
 @  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 @
@@ -11,27 +11,26 @@
 @  be found in the AUTHORS file in the root of the source tree.
 @
 
-    .global vpx_idct4x4_16_add_neon 
-	.type vpx_idct4x4_16_add_neon, function
-   .arm
-   .eabi_attribute 24, 1 @Tag_ABI_align_needed
-   .eabi_attribute 25, 1 @Tag_ABI_align_preserved
+    .global vpx_idct4x4_16_add_neon
+    .type vpx_idct4x4_16_add_neon, function
+    .arm
+    .eabi_attribute 24, 1 @Tag_ABI_align_needed
+    .eabi_attribute 25, 1 @Tag_ABI_align_preserved
 
-.text
-.p2align 2
+    .text
+    .p2align 2
 
-    .include  "vpx_dsp/arm/idct_neon.asm.S"
+    .include "vpx_dsp/arm/idct_neon.asm.S"
 
-.text
-.p2align 2@ name this block of code
+    .text
+    .p2align 2
 @void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride)
 @
 @ r0  int16_t input
 @ r1  uint8_t *dest
 @ r2  int stride)
 
-_vpx_idct4x4_16_add_neon:
-	vpx_idct4x4_16_add_neon: @ PROC
+vpx_idct4x4_16_add_neon: @ PROC
 
     @ The 2D transform is done with two passes which are actually pretty
     @ similar. We first transform the rows. This is done by transposing
@@ -190,6 +189,6 @@
     vst1.32 {d26[1]}, [r1], r2
     vst1.32 {d26[0]}, [r1]  @ no post-increment
     bx              lr
-	.size vpx_idct4x4_16_add_neon, .-vpx_idct4x4_16_add_neon    @ ENDP  @ |vpx_idct4x4_16_add_neon|
+.size vpx_idct4x4_16_add_neon, .-vpx_idct4x4_16_add_neon    @ ENDP  @ |vpx_idct4x4_16_add_neon|
 
-	.section	.note.GNU-stack,"",%progbits
+    .section .note.GNU-stack,"",%progbits
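
As the comment inside the routine above says, the full 4x4 transform runs two nearly identical passes: a 1-D pass over the rows, a transpose, then the same pass over the columns. A structural sketch of that flow, with the 1-D kernel left as an identity placeholder rather than the real 4-point IDCT:

    #include <stdio.h>

    static void pass1d(int row[4]) { (void)row; /* real code: 4-point IDCT */ }

    static void transpose4(int m[4][4]) {
      for (int r = 0; r < 4; ++r)
        for (int c = r + 1; c < 4; ++c) { int t = m[r][c]; m[r][c] = m[c][r]; m[c][r] = t; }
    }

    static void idct4x4_two_pass(int m[4][4]) {
      for (int r = 0; r < 4; ++r) pass1d(m[r]);  /* pass 1: rows */
      transpose4(m);
      for (int r = 0; r < 4; ++r) pass1d(m[r]);  /* pass 2: columns (as rows) */
      transpose4(m);
    }

    int main(void) {
      int m[4][4] = {{1, 2, 3, 4}};
      idct4x4_two_pass(m);
      printf("%d\n", m[0][0]);
      return 0;
    }
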
diff --git a/config/arm-neon/vpx_dsp/arm/idct_neon.asm.S b/config/arm-neon/vpx_dsp/arm/idct_neon.asm.S
index e6ee7ca..0033be8 100644
--- a/config/arm-neon/vpx_dsp/arm/idct_neon.asm.S
+++ b/config/arm-neon/vpx_dsp/arm/idct_neon.asm.S
@@ -1,6 +1,6 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.syntax unified
+.syntax unified
 @
 @  Copyright (c) 2016 The WebM project authors. All Rights Reserved.
 @
@@ -11,12 +11,12 @@
 @  be found in the AUTHORS file in the root of the source tree.
 @
 
-    .include  "./vpx_config.asm"
+    .include "./vpx_config.asm"
 
     @ Helper functions used to load tran_low_t into int16, narrowing if
     @ necessary.
 
-    @ $dst0..3 are d registers with the pairs assumed to be contiguous in
+    @ \dst0..3 are d registers with the pairs assumed to be contiguous in
     @ non-high-bitdepth builds. q0-q3 are used as temporaries in high-bitdepth.
 .macro    LOAD_TRAN_LOW_TO_S16 dst0, dst1, dst2, dst3, src
     .if CONFIG_VP9_HIGHBITDEPTH
@@ -31,7 +31,7 @@
     .endif
     .endm
 
-    @ $dst0..3 are d registers. q0-q3 are used as temporaries in high-bitdepth.
+    @ \dst0..3 are d registers. q0-q3 are used as temporaries in high-bitdepth.
 .macro    LOAD_TRAN_LOW_TO_S16X2 dst0, dst1, dst2, dst3, src
     .if CONFIG_VP9_HIGHBITDEPTH
     vld2.s32        {q0,q1}, [\src]!
@@ -44,4 +44,4 @@
     vld2.s16        {\dst0,\dst1,\dst2,\dst3}, [\src]!
     .endif
     .endm
-	.section	.note.GNU-stack,"",%progbits
+    .section .note.GNU-stack,"",%progbits
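
The LOAD_TRAN_LOW_TO_S16 macros exist because tran_low_t is 32-bit in high-bitdepth builds, so coefficients must be narrowed to int16 before the 16-bit NEON arithmetic; otherwise they load as-is. A hedged C equivalent of that narrowing (the function name is local; the per-lane narrow corresponds to vmovn.i32):

    #include <stdint.h>
    #include <stdio.h>

    #define CONFIG_VP9_HIGHBITDEPTH 1  /* set here just for the sketch */

    #if CONFIG_VP9_HIGHBITDEPTH
    typedef int32_t tran_low_t;
    #else
    typedef int16_t tran_low_t;
    #endif

    static void load_tran_low_to_s16(int16_t *dst, const tran_low_t *src, int n) {
      for (int i = 0; i < n; ++i) dst[i] = (int16_t)src[i];  /* narrow per lane */
    }

    int main(void) {
      tran_low_t coeffs[4] = {1, -2, 300, -400};
      int16_t s16[4];
      load_tran_low_to_s16(s16, coeffs, 4);
      printf("%d %d %d %d\n", s16[0], s16[1], s16[2], s16[3]);
      return 0;
    }
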
diff --git a/config/arm-neon/vpx_dsp/arm/intrapred_neon_asm.asm.S b/config/arm-neon/vpx_dsp/arm/intrapred_neon_asm.asm.S
index 9f656b8..130dcd9 100644
--- a/config/arm-neon/vpx_dsp/arm/intrapred_neon_asm.asm.S
+++ b/config/arm-neon/vpx_dsp/arm/intrapred_neon_asm.asm.S
@@ -1,6 +1,6 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.syntax unified
+.syntax unified
 @
 @  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
 @
@@ -11,36 +11,36 @@
 @  be found in the AUTHORS file in the root of the source tree.
 @
 
-    .global vpx_v_predictor_4x4_neon 
-	.type vpx_v_predictor_4x4_neon, function
-    .global vpx_v_predictor_8x8_neon 
-	.type vpx_v_predictor_8x8_neon, function
-    .global vpx_v_predictor_16x16_neon 
-	.type vpx_v_predictor_16x16_neon, function
-    .global vpx_v_predictor_32x32_neon 
-	.type vpx_v_predictor_32x32_neon, function
-    .global vpx_h_predictor_4x4_neon 
-	.type vpx_h_predictor_4x4_neon, function
-    .global vpx_h_predictor_8x8_neon 
-	.type vpx_h_predictor_8x8_neon, function
-    .global vpx_h_predictor_16x16_neon 
-	.type vpx_h_predictor_16x16_neon, function
-    .global vpx_h_predictor_32x32_neon 
-	.type vpx_h_predictor_32x32_neon, function
-    .global vpx_tm_predictor_4x4_neon 
-	.type vpx_tm_predictor_4x4_neon, function
-    .global vpx_tm_predictor_8x8_neon 
-	.type vpx_tm_predictor_8x8_neon, function
-    .global vpx_tm_predictor_16x16_neon 
-	.type vpx_tm_predictor_16x16_neon, function
-    .global vpx_tm_predictor_32x32_neon 
-	.type vpx_tm_predictor_32x32_neon, function
-   .arm
-   .eabi_attribute 24, 1 @Tag_ABI_align_needed
-   .eabi_attribute 25, 1 @Tag_ABI_align_preserved
+    .global vpx_v_predictor_4x4_neon
+    .type vpx_v_predictor_4x4_neon, function
+    .global vpx_v_predictor_8x8_neon
+    .type vpx_v_predictor_8x8_neon, function
+    .global vpx_v_predictor_16x16_neon
+    .type vpx_v_predictor_16x16_neon, function
+    .global vpx_v_predictor_32x32_neon
+    .type vpx_v_predictor_32x32_neon, function
+    .global vpx_h_predictor_4x4_neon
+    .type vpx_h_predictor_4x4_neon, function
+    .global vpx_h_predictor_8x8_neon
+    .type vpx_h_predictor_8x8_neon, function
+    .global vpx_h_predictor_16x16_neon
+    .type vpx_h_predictor_16x16_neon, function
+    .global vpx_h_predictor_32x32_neon
+    .type vpx_h_predictor_32x32_neon, function
+    .global vpx_tm_predictor_4x4_neon
+    .type vpx_tm_predictor_4x4_neon, function
+    .global vpx_tm_predictor_8x8_neon
+    .type vpx_tm_predictor_8x8_neon, function
+    .global vpx_tm_predictor_16x16_neon
+    .type vpx_tm_predictor_16x16_neon, function
+    .global vpx_tm_predictor_32x32_neon
+    .type vpx_tm_predictor_32x32_neon, function
+    .arm
+    .eabi_attribute 24, 1 @Tag_ABI_align_needed
+    .eabi_attribute 25, 1 @Tag_ABI_align_preserved
 
-.text
-.p2align 2
+    .text
+    .p2align 2
 
 @void vpx_v_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
 @                              const uint8_t *above,
@@ -50,15 +50,14 @@
 @ r2  const uint8_t *above
 @ r3  const uint8_t *left
 
-_vpx_v_predictor_4x4_neon:
-	vpx_v_predictor_4x4_neon: @ PROC
+vpx_v_predictor_4x4_neon: @ PROC
     vld1.32             {d0[0]}, [r2]
     vst1.32             {d0[0]}, [r0], r1
     vst1.32             {d0[0]}, [r0], r1
     vst1.32             {d0[0]}, [r0], r1
     vst1.32             {d0[0]}, [r0], r1
     bx                  lr
-	.size vpx_v_predictor_4x4_neon, .-vpx_v_predictor_4x4_neon    @ ENDP                @ |vpx_v_predictor_4x4_neon|
+.size vpx_v_predictor_4x4_neon, .-vpx_v_predictor_4x4_neon    @ ENDP                @ |vpx_v_predictor_4x4_neon|
 
 @void vpx_v_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
 @                              const uint8_t *above,
@@ -68,8 +67,7 @@
 @ r2  const uint8_t *above
 @ r3  const uint8_t *left
 
-_vpx_v_predictor_8x8_neon:
-	vpx_v_predictor_8x8_neon: @ PROC
+vpx_v_predictor_8x8_neon: @ PROC
     vld1.8              {d0}, [r2]
     vst1.8              {d0}, [r0], r1
     vst1.8              {d0}, [r0], r1
@@ -80,7 +78,7 @@
     vst1.8              {d0}, [r0], r1
     vst1.8              {d0}, [r0], r1
     bx                  lr
-	.size vpx_v_predictor_8x8_neon, .-vpx_v_predictor_8x8_neon    @ ENDP                @ |vpx_v_predictor_8x8_neon|
+.size vpx_v_predictor_8x8_neon, .-vpx_v_predictor_8x8_neon    @ ENDP                @ |vpx_v_predictor_8x8_neon|
 
 @void vpx_v_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
 @                                const uint8_t *above,
@@ -90,8 +88,7 @@
 @ r2  const uint8_t *above
 @ r3  const uint8_t *left
 
-_vpx_v_predictor_16x16_neon:
-	vpx_v_predictor_16x16_neon: @ PROC
+vpx_v_predictor_16x16_neon: @ PROC
     vld1.8              {q0}, [r2]
     vst1.8              {q0}, [r0], r1
     vst1.8              {q0}, [r0], r1
@@ -110,7 +107,7 @@
     vst1.8              {q0}, [r0], r1
     vst1.8              {q0}, [r0], r1
     bx                  lr
-	.size vpx_v_predictor_16x16_neon, .-vpx_v_predictor_16x16_neon    @ ENDP                @ |vpx_v_predictor_16x16_neon|
+.size vpx_v_predictor_16x16_neon, .-vpx_v_predictor_16x16_neon    @ ENDP                @ |vpx_v_predictor_16x16_neon|
 
 @void vpx_v_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
 @                                const uint8_t *above,
@@ -120,8 +117,7 @@
 @ r2  const uint8_t *above
 @ r3  const uint8_t *left
 
-_vpx_v_predictor_32x32_neon:
-	vpx_v_predictor_32x32_neon: @ PROC
+vpx_v_predictor_32x32_neon: @ PROC
     vld1.8              {q0, q1}, [r2]
     mov                 r2, #2
 loop_v:
@@ -144,7 +140,7 @@
     subs                r2, r2, #1
     bgt                 loop_v
     bx                  lr
-	.size vpx_v_predictor_32x32_neon, .-vpx_v_predictor_32x32_neon    @ ENDP                @ |vpx_v_predictor_32x32_neon|
+.size vpx_v_predictor_32x32_neon, .-vpx_v_predictor_32x32_neon    @ ENDP                @ |vpx_v_predictor_32x32_neon|
 
 @void vpx_h_predictor_4x4_neon(uint8_t *dst, ptrdiff_t y_stride,
 @                              const uint8_t *above,
@@ -154,8 +150,7 @@
 @ r2  const uint8_t *above
 @ r3  const uint8_t *left
 
-_vpx_h_predictor_4x4_neon:
-	vpx_h_predictor_4x4_neon: @ PROC
+vpx_h_predictor_4x4_neon: @ PROC
     vld1.32             {d1[0]}, [r3]
     vdup.8              d0, d1[0]
     vst1.32             {d0[0]}, [r0], r1
@@ -166,7 +161,7 @@
     vdup.8              d0, d1[3]
     vst1.32             {d0[0]}, [r0], r1
     bx                  lr
-	.size vpx_h_predictor_4x4_neon, .-vpx_h_predictor_4x4_neon    @ ENDP                @ |vpx_h_predictor_4x4_neon|
+.size vpx_h_predictor_4x4_neon, .-vpx_h_predictor_4x4_neon    @ ENDP                @ |vpx_h_predictor_4x4_neon|
 
 @void vpx_h_predictor_8x8_neon(uint8_t *dst, ptrdiff_t y_stride,
 @                              const uint8_t *above,
@@ -176,8 +171,7 @@
 @ r2  const uint8_t *above
 @ r3  const uint8_t *left
 
-_vpx_h_predictor_8x8_neon:
-	vpx_h_predictor_8x8_neon: @ PROC
+vpx_h_predictor_8x8_neon: @ PROC
     vld1.64             {d1}, [r3]
     vdup.8              d0, d1[0]
     vst1.64             {d0}, [r0], r1
@@ -196,7 +190,7 @@
     vdup.8              d0, d1[7]
     vst1.64             {d0}, [r0], r1
     bx                  lr
-	.size vpx_h_predictor_8x8_neon, .-vpx_h_predictor_8x8_neon    @ ENDP                @ |vpx_h_predictor_8x8_neon|
+.size vpx_h_predictor_8x8_neon, .-vpx_h_predictor_8x8_neon    @ ENDP                @ |vpx_h_predictor_8x8_neon|
 
 @void vpx_h_predictor_16x16_neon(uint8_t *dst, ptrdiff_t y_stride,
 @                                const uint8_t *above,
@@ -206,8 +200,7 @@
 @ r2  const uint8_t *above
 @ r3  const uint8_t *left
 
-_vpx_h_predictor_16x16_neon:
-	vpx_h_predictor_16x16_neon: @ PROC
+vpx_h_predictor_16x16_neon: @ PROC
     vld1.8              {q1}, [r3]
     vdup.8              q0, d2[0]
     vst1.8              {q0}, [r0], r1
@@ -242,7 +235,7 @@
     vdup.8              q0, d3[7]
     vst1.8              {q0}, [r0], r1
     bx                  lr
-	.size vpx_h_predictor_16x16_neon, .-vpx_h_predictor_16x16_neon    @ ENDP                @ |vpx_h_predictor_16x16_neon|
+.size vpx_h_predictor_16x16_neon, .-vpx_h_predictor_16x16_neon    @ ENDP                @ |vpx_h_predictor_16x16_neon|
 
 @void vpx_h_predictor_32x32_neon(uint8_t *dst, ptrdiff_t y_stride,
 @                                const uint8_t *above,
@@ -252,8 +245,7 @@
 @ r2  const uint8_t *above
 @ r3  const uint8_t *left
 
-_vpx_h_predictor_32x32_neon:
-	vpx_h_predictor_32x32_neon: @ PROC
+vpx_h_predictor_32x32_neon: @ PROC
     sub                 r1, r1, #16
     mov                 r2, #2
 loop_h:
@@ -309,7 +301,7 @@
     subs                r2, r2, #1
     bgt                 loop_h
     bx                  lr
-	.size vpx_h_predictor_32x32_neon, .-vpx_h_predictor_32x32_neon    @ ENDP                @ |vpx_h_predictor_32x32_neon|
+.size vpx_h_predictor_32x32_neon, .-vpx_h_predictor_32x32_neon    @ ENDP                @ |vpx_h_predictor_32x32_neon|
 
 @void vpx_tm_predictor_4x4_neon (uint8_t *dst, ptrdiff_t y_stride,
 @                                const uint8_t *above,
@@ -319,8 +311,7 @@
 @ r2  const uint8_t *above
 @ r3  const uint8_t *left
 
-_vpx_tm_predictor_4x4_neon:
-	vpx_tm_predictor_4x4_neon: @ PROC
+vpx_tm_predictor_4x4_neon: @ PROC
     @ Load ytop_left = above[-1];
     sub                 r12, r2, #1
     vld1.u8             {d0[]}, [r12]
@@ -356,7 +347,7 @@
     vst1.32             {d0[0]}, [r0], r1
     vst1.32             {d1[0]}, [r0], r1
     bx                  lr
-	.size vpx_tm_predictor_4x4_neon, .-vpx_tm_predictor_4x4_neon    @ ENDP                @ |vpx_tm_predictor_4x4_neon|
+.size vpx_tm_predictor_4x4_neon, .-vpx_tm_predictor_4x4_neon    @ ENDP                @ |vpx_tm_predictor_4x4_neon|
 
 @void vpx_tm_predictor_8x8_neon (uint8_t *dst, ptrdiff_t y_stride,
 @                                const uint8_t *above,
@@ -366,8 +357,7 @@
 @ r2  const uint8_t *above
 @ r3  const uint8_t *left
 
-_vpx_tm_predictor_8x8_neon:
-	vpx_tm_predictor_8x8_neon: @ PROC
+vpx_tm_predictor_8x8_neon: @ PROC
     @ Load ytop_left = above[-1];
     sub                 r12, r2, #1
     vld1.8              {d0[]}, [r12]
@@ -429,7 +419,7 @@
     vst1.64             {d3}, [r0], r1
 
     bx                  lr
-	.size vpx_tm_predictor_8x8_neon, .-vpx_tm_predictor_8x8_neon    @ ENDP                @ |vpx_tm_predictor_8x8_neon|
+.size vpx_tm_predictor_8x8_neon, .-vpx_tm_predictor_8x8_neon    @ ENDP                @ |vpx_tm_predictor_8x8_neon|
 
 @void vpx_tm_predictor_16x16_neon (uint8_t *dst, ptrdiff_t y_stride,
 @                                const uint8_t *above,
@@ -439,8 +429,7 @@
 @ r2  const uint8_t *above
 @ r3  const uint8_t *left
 
-_vpx_tm_predictor_16x16_neon:
-	vpx_tm_predictor_16x16_neon: @ PROC
+vpx_tm_predictor_16x16_neon: @ PROC
     @ Load ytop_left = above[-1];
     sub                 r12, r2, #1
     vld1.8              {d0[]}, [r12]
@@ -523,7 +512,7 @@
     bgt                 loop_16x16_neon
 
     bx                  lr
-	.size vpx_tm_predictor_16x16_neon, .-vpx_tm_predictor_16x16_neon    @ ENDP                @ |vpx_tm_predictor_16x16_neon|
+.size vpx_tm_predictor_16x16_neon, .-vpx_tm_predictor_16x16_neon    @ ENDP                @ |vpx_tm_predictor_16x16_neon|
 
 @void vpx_tm_predictor_32x32_neon (uint8_t *dst, ptrdiff_t y_stride,
 @                                  const uint8_t *above,
@@ -533,8 +522,7 @@
 @ r2  const uint8_t *above
 @ r3  const uint8_t *left
 
-_vpx_tm_predictor_32x32_neon:
-	vpx_tm_predictor_32x32_neon: @ PROC
+vpx_tm_predictor_32x32_neon: @ PROC
     @ Load ytop_left = above[-1];
     sub                 r12, r2, #1
     vld1.8              {d0[]}, [r12]
@@ -653,6 +641,6 @@
     bgt                 loop_32x32_neon
 
     bx                  lr
-	.size vpx_tm_predictor_32x32_neon, .-vpx_tm_predictor_32x32_neon    @ ENDP                @ |vpx_tm_predictor_32x32_neon|
+.size vpx_tm_predictor_32x32_neon, .-vpx_tm_predictor_32x32_neon    @ ENDP                @ |vpx_tm_predictor_32x32_neon|
 
-	.section	.note.GNU-stack,"",%progbits
+    .section .note.GNU-stack,"",%progbits
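
Of the predictors above, the TM ("TrueMotion") variants are the only non-trivial ones: per the "Load ytop_left = above[-1]" comments, each output pixel is left[r] + above[c] - ytop_left, clipped to 8 bits. A hedged C restatement of the 4x4 case:

    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint8_t clip_u8(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

    static void tm_predictor_4x4(uint8_t *dst, ptrdiff_t stride,
                                 const uint8_t *above, const uint8_t *left) {
      const int ytop_left = above[-1];       /* pixel above-left of the block */
      for (int r = 0; r < 4; ++r)
        for (int c = 0; c < 4; ++c)
          dst[r * stride + c] = clip_u8(left[r] + above[c] - ytop_left);
    }

    int main(void) {
      uint8_t row[5] = {128, 130, 131, 129, 127};  /* row[0] is above[-1] */
      uint8_t left[4] = {126, 125, 127, 128};
      uint8_t dst[16];
      tm_predictor_4x4(dst, 4, row + 1, left);
      printf("%d\n", dst[0]);
      return 0;
    }
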
diff --git a/config/arm-neon/vpx_dsp/arm/loopfilter_16_neon.asm.S b/config/arm-neon/vpx_dsp/arm/loopfilter_16_neon.asm.S
index ac8f348..d63367b 100644
--- a/config/arm-neon/vpx_dsp/arm/loopfilter_16_neon.asm.S
+++ b/config/arm-neon/vpx_dsp/arm/loopfilter_16_neon.asm.S
@@ -1,6 +1,6 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.syntax unified
+.syntax unified
 @
 @  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 @
@@ -11,18 +11,18 @@
 @  be found in the AUTHORS file in the root of the source tree.
 @
 
-    .global vpx_lpf_horizontal_16_neon 
-	.type vpx_lpf_horizontal_16_neon, function
-    .global vpx_lpf_horizontal_16_dual_neon 
-	.type vpx_lpf_horizontal_16_dual_neon, function
-    .global vpx_lpf_vertical_16_neon 
-	.type vpx_lpf_vertical_16_neon, function
-    .global vpx_lpf_vertical_16_dual_neon 
-	.type vpx_lpf_vertical_16_dual_neon, function
-   .arm
+    .global vpx_lpf_horizontal_16_neon
+    .type vpx_lpf_horizontal_16_neon, function
+    .global vpx_lpf_horizontal_16_dual_neon
+    .type vpx_lpf_horizontal_16_dual_neon, function
+    .global vpx_lpf_vertical_16_neon
+    .type vpx_lpf_vertical_16_neon, function
+    .global vpx_lpf_vertical_16_dual_neon
+    .type vpx_lpf_vertical_16_dual_neon, function
+    .arm
 
-.text
-.p2align 2
+    .text
+    .p2align 2
 
 @ void mb_lpf_horizontal_edge(uint8_t *s, int p,
 @                             const uint8_t *blimit,
@@ -35,8 +35,7 @@
 @ r3    const uint8_t *limit,
 @ sp    const uint8_t *thresh,
 @ r12   int count
-_mb_lpf_horizontal_edge:
-	mb_lpf_horizontal_edge: @ PROC
+mb_lpf_horizontal_edge: @ PROC
     push        {r4-r8, lr}
     vpush       {d8-d15}
     ldr         r4, [sp, #88]              @ load thresh
@@ -126,7 +125,7 @@
     vpop        {d8-d15}
     pop         {r4-r8, pc}
 
-	.size mb_lpf_horizontal_edge, .-mb_lpf_horizontal_edge    @ ENDP        @ |mb_lpf_horizontal_edge|
+.size mb_lpf_horizontal_edge, .-mb_lpf_horizontal_edge    @ ENDP        @ |mb_lpf_horizontal_edge|
 
 @ void vpx_lpf_horizontal_16_neon(uint8_t *s, int pitch,
 @                                     const uint8_t *blimit,
@@ -137,11 +136,10 @@
 @ r2    const uint8_t *blimit,
 @ r3    const uint8_t *limit,
 @ sp    const uint8_t *thresh
-_vpx_lpf_horizontal_16_neon:
-	vpx_lpf_horizontal_16_neon: @ PROC
+vpx_lpf_horizontal_16_neon: @ PROC
     mov r12, #1
     b mb_lpf_horizontal_edge
-	.size vpx_lpf_horizontal_16_neon, .-vpx_lpf_horizontal_16_neon    @ ENDP        @ |vpx_lpf_horizontal_16_neon|
+.size vpx_lpf_horizontal_16_neon, .-vpx_lpf_horizontal_16_neon    @ ENDP        @ |vpx_lpf_horizontal_16_neon|
 
 @ void vpx_lpf_horizontal_16_dual_neon(uint8_t *s, int pitch,
 @                                      const uint8_t *blimit,
@@ -152,11 +150,10 @@
 @ r2    const uint8_t *blimit,
 @ r3    const uint8_t *limit,
 @ sp    const uint8_t *thresh
-_vpx_lpf_horizontal_16_dual_neon:
-	vpx_lpf_horizontal_16_dual_neon: @ PROC
+vpx_lpf_horizontal_16_dual_neon: @ PROC
     mov r12, #2
     b mb_lpf_horizontal_edge
-	.size vpx_lpf_horizontal_16_dual_neon, .-vpx_lpf_horizontal_16_dual_neon    @ ENDP        @ |vpx_lpf_horizontal_16_dual_neon|
+.size vpx_lpf_horizontal_16_dual_neon, .-vpx_lpf_horizontal_16_dual_neon    @ ENDP        @ |vpx_lpf_horizontal_16_dual_neon|
 
 @ void mb_lpf_vertical_edge_w(uint8_t *s, int p, const uint8_t *blimit,
 @                             const uint8_t *limit, const uint8_t *thresh,
@@ -167,8 +164,7 @@
 @ r3    const uint8_t *limit,
 @ sp    const uint8_t *thresh,
 @ r12   int count
-_mb_lpf_vertical_edge_w:
-	mb_lpf_vertical_edge_w: @ PROC
+mb_lpf_vertical_edge_w: @ PROC
     push        {r4-r8, lr}
     vpush       {d8-d15}
     ldr         r4, [sp, #88]              @ load thresh
@@ -326,7 +322,7 @@
     vpop        {d8-d15}
     pop         {r4-r8, pc}
 
-	.size mb_lpf_vertical_edge_w, .-mb_lpf_vertical_edge_w    @ ENDP        @ |mb_lpf_vertical_edge_w|
+.size mb_lpf_vertical_edge_w, .-mb_lpf_vertical_edge_w    @ ENDP        @ |mb_lpf_vertical_edge_w|
 
 @ void vpx_lpf_vertical_16_neon(uint8_t *s, int p, const uint8_t *blimit,
 @                               const uint8_t *limit, const uint8_t *thresh)
@@ -335,11 +331,10 @@
 @ r2    const uint8_t *blimit,
 @ r3    const uint8_t *limit,
 @ sp    const uint8_t *thresh
-_vpx_lpf_vertical_16_neon:
-	vpx_lpf_vertical_16_neon: @ PROC
+vpx_lpf_vertical_16_neon: @ PROC
     mov r12, #1
     b mb_lpf_vertical_edge_w
-	.size vpx_lpf_vertical_16_neon, .-vpx_lpf_vertical_16_neon    @ ENDP        @ |vpx_lpf_vertical_16_neon|
+.size vpx_lpf_vertical_16_neon, .-vpx_lpf_vertical_16_neon    @ ENDP        @ |vpx_lpf_vertical_16_neon|
 
 @ void vpx_lpf_vertical_16_dual_neon(uint8_t *s, int p, const uint8_t *blimit,
 @                                    const uint8_t *limit,
@@ -349,11 +344,10 @@
 @ r2    const uint8_t *blimit,
 @ r3    const uint8_t *limit,
 @ sp    const uint8_t *thresh
-_vpx_lpf_vertical_16_dual_neon:
-	vpx_lpf_vertical_16_dual_neon: @ PROC
+vpx_lpf_vertical_16_dual_neon: @ PROC
     mov r12, #2
     b mb_lpf_vertical_edge_w
-	.size vpx_lpf_vertical_16_dual_neon, .-vpx_lpf_vertical_16_dual_neon    @ ENDP        @ |vpx_lpf_vertical_16_dual_neon|
+.size vpx_lpf_vertical_16_dual_neon, .-vpx_lpf_vertical_16_dual_neon    @ ENDP        @ |vpx_lpf_vertical_16_dual_neon|
 
 @ void vpx_wide_mbfilter_neon();
 @ This is a helper function for the loopfilters. The individual functions do the

@@ -379,8 +373,7 @@
 @ d13   q5
 @ d14   q6
 @ d15   q7
-_vpx_wide_mbfilter_neon:
-	vpx_wide_mbfilter_neon: @ PROC
+vpx_wide_mbfilter_neon: @ PROC
     mov         r7, #0
 
     @ filter_mask
@@ -676,6 +669,6 @@
     vbif        d3, d14, d17               @ oq6 |= q6 & ~(f2 & f & m)
 
     bx          lr
-	.size vpx_wide_mbfilter_neon, .-vpx_wide_mbfilter_neon    @ ENDP        @ |vpx_wide_mbfilter_neon|
+.size vpx_wide_mbfilter_neon, .-vpx_wide_mbfilter_neon    @ ENDP        @ |vpx_wide_mbfilter_neon|
 
-	.section	.note.GNU-stack,"",%progbits
+    .section .note.GNU-stack,"",%progbits
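
The single and dual entry points above share one body: each just sets a count in r12 (1 for the single-block form, 2 for the dual form) and branches to mb_lpf_horizontal_edge / mb_lpf_vertical_edge_w. A sketch of that wrapper pattern in C, with placeholder names and a print in place of the real filtering:

    #include <stdio.h>

    static void mb_lpf_horizontal_edge(unsigned char *s, int pitch, int count) {
      (void)s;
      printf("filtering %d block(s) at pitch %d\n", count, pitch);
    }

    static void lpf_horizontal_16(unsigned char *s, int pitch) {
      mb_lpf_horizontal_edge(s, pitch, 1);   /* mov r12, #1 */
    }

    static void lpf_horizontal_16_dual(unsigned char *s, int pitch) {
      mb_lpf_horizontal_edge(s, pitch, 2);   /* mov r12, #2 */
    }

    int main(void) {
      unsigned char buf[64] = {0};
      lpf_horizontal_16(buf, 8);
      lpf_horizontal_16_dual(buf, 8);
      return 0;
    }
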
diff --git a/config/arm-neon/vpx_dsp/arm/loopfilter_4_neon.asm.S b/config/arm-neon/vpx_dsp/arm/loopfilter_4_neon.asm.S
index 712a534..6a308cb 100644
--- a/config/arm-neon/vpx_dsp/arm/loopfilter_4_neon.asm.S
+++ b/config/arm-neon/vpx_dsp/arm/loopfilter_4_neon.asm.S
@@ -1,6 +1,6 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.syntax unified
+.syntax unified
 @
 @  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 @
@@ -11,18 +11,18 @@
 @  be found in the AUTHORS file in the root of the source tree.
 @
 
-    .global vpx_lpf_horizontal_4_neon 
-	.type vpx_lpf_horizontal_4_neon, function
-    .global vpx_lpf_vertical_4_neon 
-	.type vpx_lpf_vertical_4_neon, function
-    .global vpx_lpf_horizontal_4_dual_neon 
-	.type vpx_lpf_horizontal_4_dual_neon, function
-    .global vpx_lpf_vertical_4_dual_neon 
-	.type vpx_lpf_vertical_4_dual_neon, function
-   .arm
+    .global vpx_lpf_horizontal_4_neon
+    .type vpx_lpf_horizontal_4_neon, function
+    .global vpx_lpf_vertical_4_neon
+    .type vpx_lpf_vertical_4_neon, function
+    .global vpx_lpf_horizontal_4_dual_neon
+    .type vpx_lpf_horizontal_4_dual_neon, function
+    .global vpx_lpf_vertical_4_dual_neon
+    .type vpx_lpf_vertical_4_dual_neon, function
+    .arm
 
-.text
-.p2align 2
+    .text
+    .p2align 2
 
 @ Currently vpx only works on 8 iterations at a time. The vp8 loop filter
 @ works on 16 iterations at a time.
@@ -38,8 +38,7 @@
 @ r2    const uint8_t *blimit,
 @ r3    const uint8_t *limit,
 @ sp    const uint8_t *thresh,
-_vpx_lpf_horizontal_4_neon:
-	vpx_lpf_horizontal_4_neon: @ PROC
+vpx_lpf_horizontal_4_neon: @ PROC
     push        {lr}
 
     vld1.8      {d0[]}, [r2]               @ duplicate *blimit
@@ -72,7 +71,7 @@
     vst1.u8     {d7}, [r3,:64], r1          @ store oq1
 
     pop         {pc}
-	.size vpx_lpf_horizontal_4_neon, .-vpx_lpf_horizontal_4_neon    @ ENDP        @ |vpx_lpf_horizontal_4_neon|
+.size vpx_lpf_horizontal_4_neon, .-vpx_lpf_horizontal_4_neon    @ ENDP        @ |vpx_lpf_horizontal_4_neon|
 
 @ Currently vpx only works on 8 iterations at a time. The vp8 loop filter
 @ works on 16 iterations at a time.
@@ -88,8 +87,7 @@
 @ r2    const uint8_t *blimit,
 @ r3    const uint8_t *limit,
 @ sp    const uint8_t *thresh,
-_vpx_lpf_vertical_4_neon:
-	vpx_lpf_vertical_4_neon: @ PROC
+vpx_lpf_vertical_4_neon: @ PROC
     push        {lr}
 
     vld1.8      {d0[]}, [r2]              @ duplicate *blimit
@@ -140,7 +138,7 @@
     vst4.8      {d4[7], d5[7], d6[7], d7[7]}, [r0]
 
     pop         {pc}
-	.size vpx_lpf_vertical_4_neon, .-vpx_lpf_vertical_4_neon    @ ENDP        @ |vpx_lpf_vertical_4_neon|
+.size vpx_lpf_vertical_4_neon, .-vpx_lpf_vertical_4_neon    @ ENDP        @ |vpx_lpf_vertical_4_neon|
 
 @ void filter4_8();
 @ This is a helper function for the loopfilters. The individual functions do the
@@ -166,8 +164,7 @@
 @ d5    op0
 @ d6    oq0
 @ d7    oq1
-_filter4_8:
-	filter4_8: @ PROC
+filter4_8: @ PROC
     @ filter_mask
     vabd.u8     d19, d3, d4                 @ m1 = abs(p3 - p2)
     vabd.u8     d20, d4, d5                 @ m2 = abs(p2 - p1)
@@ -257,7 +254,7 @@
     veor        d7, d20, d18                @ *oq1 = u^0x80
 
     bx          lr
-	.size filter4_8, .-filter4_8    @ ENDP        @ |filter4_8|
+.size filter4_8, .-filter4_8    @ ENDP        @ |filter4_8|
 
 @void vpx_lpf_horizontal_4_dual_neon(uint8_t *s, int p,
 @                                    const uint8_t *blimit0,
@@ -275,8 +272,7 @@
 @ sp+8  const uint8_t *limit1,
 @ sp+12 const uint8_t *thresh1,
 
-_vpx_lpf_horizontal_4_dual_neon:
-	vpx_lpf_horizontal_4_dual_neon: @ PROC
+vpx_lpf_horizontal_4_dual_neon: @ PROC
     push        {lr}
 
     ldr         r12, [sp, #4]              @ load thresh0
@@ -323,7 +319,7 @@
     vpop        {d8-d15}                   @ restore neon registers
 
     pop         {pc}
-	.size vpx_lpf_horizontal_4_dual_neon, .-vpx_lpf_horizontal_4_dual_neon    @ ENDP        @ |vpx_lpf_horizontal_4_dual_neon|
+.size vpx_lpf_horizontal_4_dual_neon, .-vpx_lpf_horizontal_4_dual_neon    @ ENDP        @ |vpx_lpf_horizontal_4_dual_neon|
 
 @void vpx_lpf_vertical_4_dual_neon(uint8_t *s, int p,
 @                                  const uint8_t *blimit0,
@@ -341,8 +337,7 @@
 @ sp+8  const uint8_t *limit1,
 @ sp+12 const uint8_t *thresh1,
 
-_vpx_lpf_vertical_4_dual_neon:
-	vpx_lpf_vertical_4_dual_neon: @ PROC
+vpx_lpf_vertical_4_dual_neon: @ PROC
     push        {lr}
 
     ldr         r12, [sp, #4]              @ load thresh0
@@ -439,7 +434,7 @@
     vpop        {d8-d15}                   @ restore neon registers
 
     pop         {pc}
-	.size vpx_lpf_vertical_4_dual_neon, .-vpx_lpf_vertical_4_dual_neon    @ ENDP        @ |vpx_lpf_vertical_4_dual_neon|
+.size vpx_lpf_vertical_4_dual_neon, .-vpx_lpf_vertical_4_dual_neon    @ ENDP        @ |vpx_lpf_vertical_4_dual_neon|
 
 @ void filter4_16();
 @ This is a helper function for the loopfilters. The individual functions do the
@@ -464,8 +459,7 @@
 @ q6    op0
 @ q7    oq0
 @ q8    oq1
-_filter4_16:
-	filter4_16: @ PROC
+filter4_16: @ PROC
 
     @ filter_mask
     vabd.u8     q11, q3, q4                 @ m1 = abs(p3 - p2)
@@ -558,6 +552,6 @@
     veor        q8, q12, q10                @ *oq1 = u^0x80
 
     bx          lr
-	.size filter4_16, .-filter4_16    @ ENDP        @ |filter4_16|
+.size filter4_16, .-filter4_16    @ ENDP        @ |filter4_16|
 
-	.section	.note.GNU-stack,"",%progbits
+    .section .note.GNU-stack,"",%progbits
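
The filter_mask comments above (m1 = abs(p3 - p2), m2 = abs(p2 - p1), ...) compute the standard libvpx gate: a column is filtered only if every neighbor delta stays within limit and the edge delta stays within blimit. A hedged scalar restatement, based on the portable libvpx loopfilter code:

    #include <stdio.h>
    #include <stdlib.h>

    static int filter4_mask(int limit, int blimit, int p3, int p2, int p1, int p0,
                            int q0, int q1, int q2, int q3) {
      int m = 1;
      m &= abs(p3 - p2) <= limit;            /* m1 */
      m &= abs(p2 - p1) <= limit;            /* m2 */
      m &= abs(p1 - p0) <= limit;
      m &= abs(q1 - q0) <= limit;
      m &= abs(q2 - q1) <= limit;
      m &= abs(q3 - q2) <= limit;
      m &= abs(p0 - q0) * 2 + abs(p1 - q1) / 2 <= blimit;
      return m;                              /* 1 = apply the filter here */
    }

    int main(void) {
      printf("flat edge: %d\n", filter4_mask(16, 48, 80, 81, 82, 83, 84, 85, 86, 87));
      printf("hard edge: %d\n", filter4_mask(16, 48, 80, 81, 82, 83, 180, 181, 182, 183));
      return 0;
    }
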
diff --git a/config/arm-neon/vpx_dsp/arm/loopfilter_8_neon.asm.S b/config/arm-neon/vpx_dsp/arm/loopfilter_8_neon.asm.S
index f5b6758..2a413be 100644
--- a/config/arm-neon/vpx_dsp/arm/loopfilter_8_neon.asm.S
+++ b/config/arm-neon/vpx_dsp/arm/loopfilter_8_neon.asm.S
@@ -1,6 +1,6 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.syntax unified
+.syntax unified
 @
 @  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 @
@@ -11,18 +11,18 @@
 @  be found in the AUTHORS file in the root of the source tree.
 @
 
-    .global vpx_lpf_horizontal_8_neon 
-	.type vpx_lpf_horizontal_8_neon, function
-    .global vpx_lpf_horizontal_8_dual_neon 
-	.type vpx_lpf_horizontal_8_dual_neon, function
-    .global vpx_lpf_vertical_8_neon 
-	.type vpx_lpf_vertical_8_neon, function
-    .global vpx_lpf_vertical_8_dual_neon 
-	.type vpx_lpf_vertical_8_dual_neon, function
-   .arm
+    .global vpx_lpf_horizontal_8_neon
+    .type vpx_lpf_horizontal_8_neon, function
+    .global vpx_lpf_horizontal_8_dual_neon
+    .type vpx_lpf_horizontal_8_dual_neon, function
+    .global vpx_lpf_vertical_8_neon
+    .type vpx_lpf_vertical_8_neon, function
+    .global vpx_lpf_vertical_8_dual_neon
+    .type vpx_lpf_vertical_8_dual_neon, function
+    .arm
 
-.text
-.p2align 2
+    .text
+    .p2align 2
 
 @ Currently vpx only works on 8 iterations at a time. The vp8 loop filter
 @ works on 16 iterations at a time.
@@ -36,8 +36,7 @@
 @ r2    const uint8_t *blimit,
 @ r3    const uint8_t *limit,
 @ sp    const uint8_t *thresh,
-_vpx_lpf_horizontal_8_neon:
-	vpx_lpf_horizontal_8_neon: @ PROC
+vpx_lpf_horizontal_8_neon: @ PROC
     push        {r4-r5, lr}
 
     vld1.8      {d0[]}, [r2]               @ duplicate *blimit
@@ -73,7 +72,7 @@
 
     pop         {r4-r5, pc}
 
-	.size vpx_lpf_horizontal_8_neon, .-vpx_lpf_horizontal_8_neon    @ ENDP        @ |vpx_lpf_horizontal_8_neon|
+.size vpx_lpf_horizontal_8_neon, .-vpx_lpf_horizontal_8_neon    @ ENDP        @ |vpx_lpf_horizontal_8_neon|
 
 @void vpx_lpf_horizontal_8_dual_neon(uint8_t *s,
 @                                    int p,
@@ -91,8 +90,7 @@
 @ sp + 4  const uint8_t *blimit1,
 @ sp + 8  const uint8_t *limit1,
 @ sp + 12 const uint8_t *thresh1,
-_vpx_lpf_horizontal_8_dual_neon:
-	vpx_lpf_horizontal_8_dual_neon: @ PROC
+vpx_lpf_horizontal_8_dual_neon: @ PROC
     push        {r0-r1, lr}
     ldr         lr, [sp, #12]
     push        {lr}                       @ thresh0
@@ -106,7 +104,7 @@
     pop         {r0-r1, lr}
     add         r0, #8                     @ s + 8
     b           vpx_lpf_horizontal_8_neon
-	.size vpx_lpf_horizontal_8_dual_neon, .-vpx_lpf_horizontal_8_dual_neon    @ ENDP        @ |vpx_lpf_horizontal_8_dual_neon|
+.size vpx_lpf_horizontal_8_dual_neon, .-vpx_lpf_horizontal_8_dual_neon    @ ENDP        @ |vpx_lpf_horizontal_8_dual_neon|
 
 @ void vpx_lpf_vertical_8_neon(uint8_t *s,
 @                              int pitch,
@@ -119,8 +117,7 @@
 @ r2    const uint8_t *blimit,
 @ r3    const uint8_t *limit,
 @ sp    const uint8_t *thresh,
-_vpx_lpf_vertical_8_neon:
-	vpx_lpf_vertical_8_neon: @ PROC
+vpx_lpf_vertical_8_neon: @ PROC
     push        {r4-r5, lr}
 
     vld1.8      {d0[]}, [r2]              @ duplicate *blimit
@@ -182,7 +179,7 @@
     vst2.8      {d4[7], d5[7]}, [r3]
 
     pop         {r4-r5, pc}
-	.size vpx_lpf_vertical_8_neon, .-vpx_lpf_vertical_8_neon    @ ENDP        @ |vpx_lpf_vertical_8_neon|
+.size vpx_lpf_vertical_8_neon, .-vpx_lpf_vertical_8_neon    @ ENDP        @ |vpx_lpf_vertical_8_neon|
 
 @void vpx_lpf_vertical_8_dual_neon(uint8_t *s,
 @                                  int pitch,
@@ -200,8 +197,7 @@
 @ sp + 4  const uint8_t *blimit1,
 @ sp + 8  const uint8_t *limit1,
 @ sp + 12 const uint8_t *thresh1,
-_vpx_lpf_vertical_8_dual_neon:
-	vpx_lpf_vertical_8_dual_neon: @ PROC
+vpx_lpf_vertical_8_dual_neon: @ PROC
     push        {r0-r1, lr}
     ldr         lr, [sp, #12]
     push        {lr}                       @ thresh0
@@ -215,7 +211,7 @@
     pop         {r0-r1, lr}
     add         r0, r0, r1, lsl #3         @ s + 8 * pitch
     b           vpx_lpf_vertical_8_neon
-	.size vpx_lpf_vertical_8_dual_neon, .-vpx_lpf_vertical_8_dual_neon    @ ENDP        @ |vpx_lpf_vertical_8_dual_neon|
+.size vpx_lpf_vertical_8_dual_neon, .-vpx_lpf_vertical_8_dual_neon    @ ENDP        @ |vpx_lpf_vertical_8_dual_neon|
 
 @ void vpx_mbloop_filter_neon();
 @ This is a helper function for the loopfilters. The individual functions do the
@@ -243,8 +239,7 @@
 @ d3    oq0
 @ d4    oq1
 @ d5    oq2
-_vpx_mbloop_filter_neon:
-	vpx_mbloop_filter_neon: @ PROC
+vpx_mbloop_filter_neon: @ PROC
     @ filter_mask
     vabd.u8     d19, d3, d4                @ m1 = abs(p3 - p2)
     vabd.u8     d20, d4, d5                @ m2 = abs(p2 - p1)
@@ -499,6 +494,6 @@
 
     bx          lr
 
-	.size vpx_mbloop_filter_neon, .-vpx_mbloop_filter_neon    @ ENDP        @ |vpx_mbloop_filter_neon|
+.size vpx_mbloop_filter_neon, .-vpx_mbloop_filter_neon    @ ENDP        @ |vpx_mbloop_filter_neon|
 
-	.section	.note.GNU-stack,"",%progbits
+    .section .note.GNU-stack,"",%progbits
diff --git a/config/arm-neon/vpx_dsp/arm/save_reg_neon.asm.S b/config/arm-neon/vpx_dsp/arm/save_reg_neon.asm.S
index ebd6bae..3cee6e5 100644
--- a/config/arm-neon/vpx_dsp/arm/save_reg_neon.asm.S
+++ b/config/arm-neon/vpx_dsp/arm/save_reg_neon.asm.S
@@ -1,6 +1,6 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.syntax unified
+.syntax unified
 @
 @  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
 @
@@ -12,31 +12,29 @@
 @
 
 
-    .global vpx_push_neon 
-	.type vpx_push_neon, function
-    .global vpx_pop_neon 
-	.type vpx_pop_neon, function
+    .global vpx_push_neon
+    .type vpx_push_neon, function
+    .global vpx_pop_neon
+    .type vpx_pop_neon, function
 
-   .arm
-   .eabi_attribute 24, 1 @Tag_ABI_align_needed
-   .eabi_attribute 25, 1 @Tag_ABI_align_preserved
+    .arm
+    .eabi_attribute 24, 1 @Tag_ABI_align_needed
+    .eabi_attribute 25, 1 @Tag_ABI_align_preserved
 
-.text
-.p2align 2
+    .text
+    .p2align 2
 
-_vpx_push_neon:
-	vpx_push_neon: @ PROC
+vpx_push_neon: @ PROC
     vstm            r0!, {d8-d15}
     bx              lr
 
-	.size vpx_push_neon, .-vpx_push_neon    @ ENDP
+.size vpx_push_neon, .-vpx_push_neon    @ ENDP
 
-_vpx_pop_neon:
-	vpx_pop_neon: @ PROC
+vpx_pop_neon: @ PROC
     vldm            r0!, {d8-d15}
     bx              lr
 
-	.size vpx_pop_neon, .-vpx_pop_neon    @ ENDP
+.size vpx_pop_neon, .-vpx_pop_neon    @ ENDP
 
 
-	.section	.note.GNU-stack,"",%progbits
+    .section .note.GNU-stack,"",%progbits
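
vpx_push_neon and vpx_pop_neon simply spill and reload d8-d15, the eight callee-saved 8-byte NEON registers, through a caller-supplied 64-byte buffer (vstm/vldm with post-increment). A hedged stand-in in portable C, modelling the register file so the save/restore contract is visible:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* fake_d_regs models d8..d15; the stubs mimic the assembly's contract.
     * Real builds use the vstm/vldm routines above. */
    static int64_t fake_d_regs[8];

    static void vpx_push_neon_stub(int64_t *store) {
      memcpy(store, fake_d_regs, sizeof fake_d_regs);
    }

    static void vpx_pop_neon_stub(const int64_t *store) {
      memcpy(fake_d_regs, store, sizeof fake_d_regs);
    }

    int main(void) {
      int64_t saved[8];                /* 8 d-registers x 8 bytes = 64 bytes */
      fake_d_regs[0] = 42;
      vpx_push_neon_stub(saved);       /* save callee-saved NEON state */
      fake_d_regs[0] = 0;              /* clobber */
      vpx_pop_neon_stub(saved);        /* restore */
      printf("d8 restored to %lld\n", (long long)fake_d_regs[0]);
      return 0;
    }
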
diff --git a/config/arm-neon/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm.S b/config/arm-neon/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm.S
index f5cdea5..00aa2b6 100644
--- a/config/arm-neon/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm.S
+++ b/config/arm-neon/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type1_neon.asm.S
@@ -1,6 +1,6 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.syntax unified
+.syntax unified
 @
 @  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 @
@@ -19,17 +19,16 @@
 @    r8 =>  ht
 @    r10 =>  wd
 
-    .global vpx_convolve8_avg_horiz_filter_type1_neon 
-	.type vpx_convolve8_avg_horiz_filter_type1_neon, function
-   .arm
-   .eabi_attribute 24, 1 @Tag_ABI_align_needed
-   .eabi_attribute 25, 1 @Tag_ABI_align_preserved
+    .global vpx_convolve8_avg_horiz_filter_type1_neon
+    .type vpx_convolve8_avg_horiz_filter_type1_neon, function
+    .arm
+    .eabi_attribute 24, 1 @Tag_ABI_align_needed
+    .eabi_attribute 25, 1 @Tag_ABI_align_preserved
 
-.text
-.p2align 2
+    .text
+    .p2align 2
 
-_vpx_convolve8_avg_horiz_filter_type1_neon:
-	vpx_convolve8_avg_horiz_filter_type1_neon: @ PROC
+vpx_convolve8_avg_horiz_filter_type1_neon: @ PROC
 
     stmfd           sp!,    {r4  -  r12,    r14} @stack stores the values of
                                                  @ the arguments
@@ -439,6 +438,6 @@
     vpop            {d8  -  d15}
     ldmfd           sp!,    {r4  -  r12,    r15} @reload the registers from sp
 
-	.size vpx_convolve8_avg_horiz_filter_type1_neon, .-vpx_convolve8_avg_horiz_filter_type1_neon    @ ENDP
+.size vpx_convolve8_avg_horiz_filter_type1_neon, .-vpx_convolve8_avg_horiz_filter_type1_neon    @ ENDP
 
-	.section	.note.GNU-stack,"",%progbits
+    .section .note.GNU-stack,"",%progbits
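
This file and the seven that follow are generated variants of one routine: an 8-tap FIR over src, rounded back to 8 bits (libvpx's taps sum to 128, i.e. FILTER_BITS = 7), with the *_avg_* forms averaging the filtered value into dst. A hedged scalar restatement of the avg+horiz case; the taps here are a placeholder, not one of libvpx's subpel filter banks:

    #include <stdint.h>
    #include <stdio.h>

    static uint8_t clip_u8(int v) { return (uint8_t)(v < 0 ? 0 : v > 255 ? 255 : v); }

    static void convolve8_avg_horiz(const uint8_t *src, uint8_t *dst, int w,
                                    const int16_t filter[8]) {
      for (int x = 0; x < w; ++x) {
        int sum = 0;
        for (int k = 0; k < 8; ++k) sum += src[x + k - 3] * filter[k];
        const uint8_t filtered = clip_u8((sum + 64) >> 7);  /* FILTER_BITS = 7 */
        dst[x] = (uint8_t)((dst[x] + filtered + 1) >> 1);   /* the "avg" step */
      }
    }

    int main(void) {
      const int16_t filter[8] = {0, 0, 0, 64, 64, 0, 0, 0}; /* placeholder taps */
      uint8_t src[16], dst[8] = {0};
      for (int i = 0; i < 16; ++i) src[i] = (uint8_t)(i * 10);
      convolve8_avg_horiz(src + 4, dst, 8, filter);
      printf("%d\n", dst[0]);
      return 0;
    }
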
diff --git a/config/arm-neon/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm.S b/config/arm-neon/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm.S
index 9e2b43f..000d413 100644
--- a/config/arm-neon/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm.S
+++ b/config/arm-neon/vpx_dsp/arm/vpx_convolve8_avg_horiz_filter_type2_neon.asm.S
@@ -1,6 +1,6 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.syntax unified
+.syntax unified
 @
 @  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 @
@@ -19,17 +19,16 @@
 @    r8 =>  ht
 @    r10 =>  wd
 
-    .global vpx_convolve8_avg_horiz_filter_type2_neon 
-	.type vpx_convolve8_avg_horiz_filter_type2_neon, function
-   .arm
-   .eabi_attribute 24, 1 @Tag_ABI_align_needed
-   .eabi_attribute 25, 1 @Tag_ABI_align_preserved
+    .global vpx_convolve8_avg_horiz_filter_type2_neon
+    .type vpx_convolve8_avg_horiz_filter_type2_neon, function
+    .arm
+    .eabi_attribute 24, 1 @Tag_ABI_align_needed
+    .eabi_attribute 25, 1 @Tag_ABI_align_preserved
 
-.text
-.p2align 2
+    .text
+    .p2align 2
 
-_vpx_convolve8_avg_horiz_filter_type2_neon:
-	vpx_convolve8_avg_horiz_filter_type2_neon: @ PROC
+vpx_convolve8_avg_horiz_filter_type2_neon: @ PROC
 
     stmfd           sp!,    {r4  -  r12,    r14} @stack stores the values of
                                                  @ the arguments
@@ -440,6 +439,6 @@
     vpop            {d8  -  d15}
     ldmfd           sp!,    {r4  -  r12,    r15} @reload the registers from sp
 
-	.size vpx_convolve8_avg_horiz_filter_type2_neon, .-vpx_convolve8_avg_horiz_filter_type2_neon    @ ENDP
+.size vpx_convolve8_avg_horiz_filter_type2_neon, .-vpx_convolve8_avg_horiz_filter_type2_neon    @ ENDP
 
-	.section	.note.GNU-stack,"",%progbits
+    .section .note.GNU-stack,"",%progbits
diff --git a/config/arm-neon/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm.S b/config/arm-neon/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm.S
index ec039e4..ce6f09d 100644
--- a/config/arm-neon/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm.S
+++ b/config/arm-neon/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type1_neon.asm.S
@@ -1,6 +1,6 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.syntax unified
+.syntax unified
 @
 @  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 @
@@ -19,17 +19,16 @@
 @    r5 =>  ht
 @    r3 =>  wd
 
-    .global vpx_convolve8_avg_vert_filter_type1_neon 
-	.type vpx_convolve8_avg_vert_filter_type1_neon, function
-   .arm
-   .eabi_attribute 24, 1 @Tag_ABI_align_needed
-   .eabi_attribute 25, 1 @Tag_ABI_align_preserved
+    .global vpx_convolve8_avg_vert_filter_type1_neon
+    .type vpx_convolve8_avg_vert_filter_type1_neon, function
+    .arm
+    .eabi_attribute 24, 1 @Tag_ABI_align_needed
+    .eabi_attribute 25, 1 @Tag_ABI_align_preserved
 
-.text
-.p2align 2
+    .text
+    .p2align 2
 
-_vpx_convolve8_avg_vert_filter_type1_neon:
-	vpx_convolve8_avg_vert_filter_type1_neon: @ PROC
+vpx_convolve8_avg_vert_filter_type1_neon: @ PROC
 
     stmfd           sp!,    {r4  -  r12,    r14} @stack stores the values of
                                                  @ the arguments
@@ -487,6 +486,6 @@
     vpop            {d8  -  d15}
     ldmfd           sp!,    {r4  -  r12,    r15} @reload the registers from sp
 
-	.size vpx_convolve8_avg_vert_filter_type1_neon, .-vpx_convolve8_avg_vert_filter_type1_neon    @ ENDP
+.size vpx_convolve8_avg_vert_filter_type1_neon, .-vpx_convolve8_avg_vert_filter_type1_neon    @ ENDP
 
-	.section	.note.GNU-stack,"",%progbits
+    .section .note.GNU-stack,"",%progbits
diff --git a/config/arm-neon/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm.S b/config/arm-neon/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm.S
index 0317a71..6b89681 100644
--- a/config/arm-neon/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm.S
+++ b/config/arm-neon/vpx_dsp/arm/vpx_convolve8_avg_vert_filter_type2_neon.asm.S
@@ -1,6 +1,6 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.syntax unified
+.syntax unified
 @
 @  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 @
@@ -19,17 +19,16 @@
 @    r5 =>  ht
 @    r3 =>  wd
 
-    .global vpx_convolve8_avg_vert_filter_type2_neon 
-	.type vpx_convolve8_avg_vert_filter_type2_neon, function
-   .arm
-   .eabi_attribute 24, 1 @Tag_ABI_align_needed
-   .eabi_attribute 25, 1 @Tag_ABI_align_preserved
+    .global vpx_convolve8_avg_vert_filter_type2_neon
+    .type vpx_convolve8_avg_vert_filter_type2_neon, function
+    .arm
+    .eabi_attribute 24, 1 @Tag_ABI_align_needed
+    .eabi_attribute 25, 1 @Tag_ABI_align_preserved
 
-.text
-.p2align 2
+    .text
+    .p2align 2
 
-_vpx_convolve8_avg_vert_filter_type2_neon:
-	vpx_convolve8_avg_vert_filter_type2_neon: @ PROC
+vpx_convolve8_avg_vert_filter_type2_neon: @ PROC
 
     stmfd           sp!,    {r4  -  r12,    r14} @stack stores the values of
                                                  @ the arguments
@@ -488,6 +487,6 @@
     vpop            {d8  -  d15}
     ldmfd           sp!,    {r4  -  r12,    r15} @reload the registers from sp
 
-	.size vpx_convolve8_avg_vert_filter_type2_neon, .-vpx_convolve8_avg_vert_filter_type2_neon    @ ENDP
+.size vpx_convolve8_avg_vert_filter_type2_neon, .-vpx_convolve8_avg_vert_filter_type2_neon    @ ENDP
 
-	.section	.note.GNU-stack,"",%progbits
+    .section .note.GNU-stack,"",%progbits
diff --git a/config/arm-neon/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm.S b/config/arm-neon/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm.S
index 44031d3..8539269 100644
--- a/config/arm-neon/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm.S
+++ b/config/arm-neon/vpx_dsp/arm/vpx_convolve8_horiz_filter_type1_neon.asm.S
@@ -1,6 +1,6 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.syntax unified
+.syntax unified
 @
 @  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 @
@@ -19,17 +19,16 @@
 @    r8 =>  ht
 @    r10 =>  wd
 
-    .global vpx_convolve8_horiz_filter_type1_neon 
-	.type vpx_convolve8_horiz_filter_type1_neon, function
-   .arm
-   .eabi_attribute 24, 1 @Tag_ABI_align_needed
-   .eabi_attribute 25, 1 @Tag_ABI_align_preserved
+    .global vpx_convolve8_horiz_filter_type1_neon
+    .type vpx_convolve8_horiz_filter_type1_neon, function
+    .arm
+    .eabi_attribute 24, 1 @Tag_ABI_align_needed
+    .eabi_attribute 25, 1 @Tag_ABI_align_preserved
 
-.text
-.p2align 2
+    .text
+    .p2align 2
 
-_vpx_convolve8_horiz_filter_type1_neon:
-	vpx_convolve8_horiz_filter_type1_neon: @ PROC
+vpx_convolve8_horiz_filter_type1_neon: @ PROC
 
     stmfd           sp!,    {r4  -  r12,    r14} @stack stores the values of
                                                  @ the arguments
@@ -416,6 +415,6 @@
     vpop            {d8  -  d15}
     ldmfd           sp!,    {r4  -  r12,    r15} @reload the registers from sp
 
-	.size vpx_convolve8_horiz_filter_type1_neon, .-vpx_convolve8_horiz_filter_type1_neon    @ ENDP
+.size vpx_convolve8_horiz_filter_type1_neon, .-vpx_convolve8_horiz_filter_type1_neon    @ ENDP
 
-	.section	.note.GNU-stack,"",%progbits
+    .section .note.GNU-stack,"",%progbits
diff --git a/config/arm-neon/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm.S b/config/arm-neon/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm.S
index 08ec5f9..de094b7 100644
--- a/config/arm-neon/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm.S
+++ b/config/arm-neon/vpx_dsp/arm/vpx_convolve8_horiz_filter_type2_neon.asm.S
@@ -1,6 +1,6 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.syntax unified
+.syntax unified
 @
 @  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 @
@@ -19,17 +19,16 @@
 @    r8 =>  ht
 @    r10 =>  wd
 
-    .global vpx_convolve8_horiz_filter_type2_neon 
-	.type vpx_convolve8_horiz_filter_type2_neon, function
-   .arm
-   .eabi_attribute 24, 1 @Tag_ABI_align_needed
-   .eabi_attribute 25, 1 @Tag_ABI_align_preserved
+    .global vpx_convolve8_horiz_filter_type2_neon
+    .type vpx_convolve8_horiz_filter_type2_neon, function
+    .arm
+    .eabi_attribute 24, 1 @Tag_ABI_align_needed
+    .eabi_attribute 25, 1 @Tag_ABI_align_preserved
 
-.text
-.p2align 2
+    .text
+    .p2align 2
 
-_vpx_convolve8_horiz_filter_type2_neon:
-	vpx_convolve8_horiz_filter_type2_neon: @ PROC
+vpx_convolve8_horiz_filter_type2_neon: @ PROC
 
     stmfd           sp!,    {r4  -  r12,    r14} @stack stores the values of
                                                  @ the arguments
@@ -416,6 +415,6 @@
     vpop            {d8  -  d15}
     ldmfd           sp!,    {r4  -  r12,    r15} @reload the registers from sp
 
-	.size vpx_convolve8_horiz_filter_type2_neon, .-vpx_convolve8_horiz_filter_type2_neon    @ ENDP
+.size vpx_convolve8_horiz_filter_type2_neon, .-vpx_convolve8_horiz_filter_type2_neon    @ ENDP
 
-	.section	.note.GNU-stack,"",%progbits
+    .section .note.GNU-stack,"",%progbits
diff --git a/config/arm-neon/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm.S b/config/arm-neon/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm.S
index ff5cf528..f7ee542 100644
--- a/config/arm-neon/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm.S
+++ b/config/arm-neon/vpx_dsp/arm/vpx_convolve8_vert_filter_type1_neon.asm.S
@@ -1,6 +1,6 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.syntax unified
+.syntax unified
 @
 @  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 @
@@ -19,17 +19,16 @@
 @    r5 =>  ht
 @    r3 =>  wd
 
-    .global vpx_convolve8_vert_filter_type1_neon 
-	.type vpx_convolve8_vert_filter_type1_neon, function
-   .arm
-   .eabi_attribute 24, 1 @Tag_ABI_align_needed
-   .eabi_attribute 25, 1 @Tag_ABI_align_preserved
+    .global vpx_convolve8_vert_filter_type1_neon
+    .type vpx_convolve8_vert_filter_type1_neon, function
+    .arm
+    .eabi_attribute 24, 1 @Tag_ABI_align_needed
+    .eabi_attribute 25, 1 @Tag_ABI_align_preserved
 
-.text
-.p2align 2
+    .text
+    .p2align 2
 
-_vpx_convolve8_vert_filter_type1_neon:
-	vpx_convolve8_vert_filter_type1_neon: @ PROC
+vpx_convolve8_vert_filter_type1_neon: @ PROC
 
     stmfd           sp!,    {r4  -  r12,    r14} @stack stores the values of
                                                  @ the arguments
@@ -458,6 +457,6 @@
     vpop            {d8  -  d15}
     ldmfd           sp!,    {r4  -  r12,    r15} @reload the registers from sp
 
-	.size vpx_convolve8_vert_filter_type1_neon, .-vpx_convolve8_vert_filter_type1_neon    @ ENDP
+.size vpx_convolve8_vert_filter_type1_neon, .-vpx_convolve8_vert_filter_type1_neon    @ ENDP
 
-	.section	.note.GNU-stack,"",%progbits
+    .section .note.GNU-stack,"",%progbits
diff --git a/config/arm-neon/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm.S b/config/arm-neon/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm.S
index fc5a98a..4c256e6 100644
--- a/config/arm-neon/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm.S
+++ b/config/arm-neon/vpx_dsp/arm/vpx_convolve8_vert_filter_type2_neon.asm.S
@@ -1,6 +1,6 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.syntax unified
+.syntax unified
 @
 @  Copyright (c) 2018 The WebM project authors. All Rights Reserved.
 @
@@ -19,17 +19,16 @@
 @    r5 =>  ht
 @    r3 =>  wd
 
-    .global vpx_convolve8_vert_filter_type2_neon 
-	.type vpx_convolve8_vert_filter_type2_neon, function
-   .arm
-   .eabi_attribute 24, 1 @Tag_ABI_align_needed
-   .eabi_attribute 25, 1 @Tag_ABI_align_preserved
+    .global vpx_convolve8_vert_filter_type2_neon
+    .type vpx_convolve8_vert_filter_type2_neon, function
+    .arm
+    .eabi_attribute 24, 1 @Tag_ABI_align_needed
+    .eabi_attribute 25, 1 @Tag_ABI_align_preserved
 
-.text
-.p2align 2
+    .text
+    .p2align 2
 
-_vpx_convolve8_vert_filter_type2_neon:
-	vpx_convolve8_vert_filter_type2_neon: @ PROC
+vpx_convolve8_vert_filter_type2_neon: @ PROC
 
     stmfd           sp!,    {r4  -  r12,    r14} @stack stores the values of
                                                  @ the arguments
@@ -456,6 +455,6 @@
     vpop            {d8  -  d15}
     ldmfd           sp!,    {r4  -  r12,    r15} @reload the registers from sp
 
-	.size vpx_convolve8_vert_filter_type2_neon, .-vpx_convolve8_vert_filter_type2_neon    @ ENDP
+.size vpx_convolve8_vert_filter_type2_neon, .-vpx_convolve8_vert_filter_type2_neon    @ ENDP
 
-	.section	.note.GNU-stack,"",%progbits
+    .section .note.GNU-stack,"",%progbits
diff --git a/config/arm-neon/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm.S b/config/arm-neon/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm.S
index 4b77ca9..4beac22 100644
--- a/config/arm-neon/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm.S
+++ b/config/arm-neon/vpx_dsp/arm/vpx_convolve_avg_neon_asm.asm.S
@@ -1,6 +1,6 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.syntax unified
+.syntax unified
 @
 @  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 @
@@ -11,17 +11,16 @@
 @  be found in the AUTHORS file in the root of the source tree.
 @
 
-    .global vpx_convolve_avg_neon 
-	.type vpx_convolve_avg_neon, function
-   .arm
-   .eabi_attribute 24, 1 @Tag_ABI_align_needed
-   .eabi_attribute 25, 1 @Tag_ABI_align_preserved
+    .global vpx_convolve_avg_neon
+    .type vpx_convolve_avg_neon, function
+    .arm
+    .eabi_attribute 24, 1 @Tag_ABI_align_needed
+    .eabi_attribute 25, 1 @Tag_ABI_align_preserved
 
-.text
-.p2align 2
+    .text
+    .p2align 2
 
-_vpx_convolve_avg_neon:
-	vpx_convolve_avg_neon: @ PROC
+vpx_convolve_avg_neon: @ PROC
     push                {r4-r6, lr}
     ldrd                r4, r5, [sp, #36]
     mov                 r6, r2
@@ -117,6 +116,6 @@
     subs                r5, r5, #2
     bgt                 avg4
     pop                 {r4-r6, pc}
-	.size vpx_convolve_avg_neon, .-vpx_convolve_avg_neon    @ ENDP
+.size vpx_convolve_avg_neon, .-vpx_convolve_avg_neon    @ ENDP
 
-	.section	.note.GNU-stack,"",%progbits
+    .section .note.GNU-stack,"",%progbits
diff --git a/config/arm-neon/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm.S b/config/arm-neon/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm.S
index ed8dfe3..6b6fb93 100644
--- a/config/arm-neon/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm.S
+++ b/config/arm-neon/vpx_dsp/arm/vpx_convolve_copy_neon_asm.asm.S
@@ -1,6 +1,6 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.syntax unified
+.syntax unified
 @
 @  Copyright (c) 2013 The WebM project authors. All Rights Reserved.
 @
@@ -11,17 +11,16 @@
 @  be found in the AUTHORS file in the root of the source tree.
 @
 
-    .global vpx_convolve_copy_neon 
-	.type vpx_convolve_copy_neon, function
-   .arm
-   .eabi_attribute 24, 1 @Tag_ABI_align_needed
-   .eabi_attribute 25, 1 @Tag_ABI_align_preserved
+    .global vpx_convolve_copy_neon
+    .type vpx_convolve_copy_neon, function
+    .arm
+    .eabi_attribute 24, 1 @Tag_ABI_align_needed
+    .eabi_attribute 25, 1 @Tag_ABI_align_preserved
 
-.text
-.p2align 2
+    .text
+    .p2align 2
 
-_vpx_convolve_copy_neon:
-	vpx_convolve_copy_neon: @ PROC
+vpx_convolve_copy_neon: @ PROC
     push                {r4-r5, lr}
     ldrd                r4, r5, [sp, #32]
 
@@ -85,6 +84,6 @@
     subs                r5, r5, #1
     bgt                 copy4
     pop                 {r4-r5, pc}
-	.size vpx_convolve_copy_neon, .-vpx_convolve_copy_neon    @ ENDP
+.size vpx_convolve_copy_neon, .-vpx_convolve_copy_neon    @ ENDP
 
-	.section	.note.GNU-stack,"",%progbits
+    .section .note.GNU-stack,"",%progbits
diff --git a/config/arm-neon/vpx_dsp_rtcd.h b/config/arm-neon/vpx_dsp_rtcd.h
index e6bd5b2..99abbb9 100644
--- a/config/arm-neon/vpx_dsp_rtcd.h
+++ b/config/arm-neon/vpx_dsp_rtcd.h
@@ -1045,10 +1045,10 @@
 void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max);
 #define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
 
-void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vpx_highbd_quantize_b vpx_highbd_quantize_b_c
 
-void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c
 
 unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1057,7 +1057,7 @@
 unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c
 
-void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c
 
 unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1066,7 +1066,7 @@
 unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c
 
-void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c
 
 unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1075,7 +1075,7 @@
 unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c
 
-void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c
 
 unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1084,7 +1084,7 @@
 unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c
 
-void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c
 
 unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1093,7 +1093,7 @@
 unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c
 
-void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c
 
 unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1102,7 +1102,7 @@
 unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c
 
-void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c
 
 unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1111,7 +1111,7 @@
 unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
 
-void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c
 
 unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1120,7 +1120,7 @@
 unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
 
-void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c
 
 unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1129,7 +1129,7 @@
 unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c
 
-void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c
 
 unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1138,7 +1138,7 @@
 unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c
 
-void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c
 
 unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1147,7 +1147,7 @@
 unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c
 
-void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c
 
 unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1156,7 +1156,7 @@
 unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c
 
-void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c
 
 unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1165,7 +1165,7 @@
 unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c
 
-void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c
 
 int vpx_highbd_satd_c(const tran_low_t *coeff, int length);
@@ -1262,8 +1262,8 @@
 int16_t vpx_int_pro_col_neon(const uint8_t *ref, const int width);
 #define vpx_int_pro_col vpx_int_pro_col_neon
 
-void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
-void vpx_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_neon(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
 #define vpx_int_pro_row vpx_int_pro_row_neon
 
 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -1337,12 +1337,12 @@
 unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_mse8x8 vpx_mse8x8_c
 
-void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vpx_quantize_b vpx_quantize_b_neon
 
-void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vpx_quantize_b_32x32 vpx_quantize_b_32x32_neon
 
 unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
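
Dropping the unused skip_block argument trims every quantizer entry point in this header to twelve parameters. A minimal sketch of the new prototype as a caller sees it; the tran_low_t definition below is an assumption (it depends on the high-bitdepth build setting), and none of this is libvpx source:

#include <stdint.h>

typedef int32_t tran_low_t; /* assumption: matches high-bitdepth builds */

/* v1.12.0 prototype: skip_block (formerly the third argument) is gone. */
void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         const int16_t *zbin_ptr, const int16_t *round_ptr,
                         const int16_t *quant_ptr,
                         const int16_t *quant_shift_ptr,
                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                         const int16_t *dequant_ptr, uint16_t *eob_ptr,
                         const int16_t *scan, const int16_t *iscan);
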
@@ -1353,16 +1353,10 @@
 unsigned int vpx_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x16_avg vpx_sad16x16_avg_neon
 
-void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x16x3 vpx_sad16x16x3_c
-
-void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x16x4d vpx_sad16x16x4d_neon
 
-void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x16x8 vpx_sad16x16x8_c
-
 unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad16x32 vpx_sad16x32_neon
@@ -1371,8 +1365,8 @@
 unsigned int vpx_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x32_avg vpx_sad16x32_avg_neon
 
-void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x32x4d vpx_sad16x32x4d_neon
 
 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1383,16 +1377,10 @@
 unsigned int vpx_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x8_avg vpx_sad16x8_avg_neon
 
-void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x8x3 vpx_sad16x8x3_c
-
-void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x8x4d vpx_sad16x8x4d_neon
 
-void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x8x8 vpx_sad16x8x8_c
-
 unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad32x16 vpx_sad32x16_neon
@@ -1401,8 +1389,8 @@
 unsigned int vpx_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x16_avg vpx_sad32x16_avg_neon
 
-void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x16x4d vpx_sad32x16x4d_neon
 
 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1413,13 +1401,10 @@
 unsigned int vpx_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x32_avg vpx_sad32x32_avg_neon
 
-void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x32x4d vpx_sad32x32x4d_neon
 
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad32x64 vpx_sad32x64_neon
@@ -1428,8 +1413,8 @@
 unsigned int vpx_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x64_avg vpx_sad32x64_avg_neon
 
-void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x64x4d vpx_sad32x64x4d_neon
 
 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1440,16 +1425,10 @@
 unsigned int vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad4x4_avg vpx_sad4x4_avg_neon
 
-void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x4x3 vpx_sad4x4x3_c
-
-void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad4x4x4d vpx_sad4x4x4d_neon
 
-void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x4x8 vpx_sad4x4x8_c
-
 unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad4x8 vpx_sad4x8_neon
@@ -1458,8 +1437,8 @@
 unsigned int vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad4x8_avg vpx_sad4x8_avg_neon
 
-void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad4x8x4d vpx_sad4x8x4d_neon
 
 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1470,8 +1449,8 @@
 unsigned int vpx_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad64x32_avg vpx_sad64x32_avg_neon
 
-void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad64x32x4d vpx_sad64x32x4d_neon
 
 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1482,8 +1461,8 @@
 unsigned int vpx_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad64x64_avg vpx_sad64x64_avg_neon
 
-void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad64x64x4d vpx_sad64x64x4d_neon
 
 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1494,16 +1473,10 @@
 unsigned int vpx_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x16_avg vpx_sad8x16_avg_neon
 
-void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x16x3 vpx_sad8x16x3_c
-
-void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x16x4d vpx_sad8x16x4d_neon
 
-void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x16x8 vpx_sad8x16x8_c
-
 unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x4 vpx_sad8x4_neon
@@ -1512,8 +1485,8 @@
 unsigned int vpx_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x4_avg vpx_sad8x4_avg_neon
 
-void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x4x4d vpx_sad8x4x4d_neon
 
 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1524,16 +1497,10 @@
 unsigned int vpx_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x8_avg vpx_sad8x8_avg_neon
 
-void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x8x3 vpx_sad8x8x3_c
-
-void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x8x4d vpx_sad8x8x4d_neon
 
-void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x8x8 vpx_sad8x8x8_c
-
 int vpx_satd_c(const tran_low_t *coeff, int length);
 int vpx_satd_neon(const tran_low_t *coeff, int length);
 #define vpx_satd vpx_satd_neon
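
The x4d SAD prototypes now spell out their four-element array bounds, which documents the contract and lets compilers warn on undersized arguments. A hedged usage sketch against the declaration above; best_of_four and its buffers are illustrative, not libvpx code:

#include <stdint.h>

void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
                          uint32_t sad_array[4]);

/* Evaluate four candidate 16x16 reference blocks in one call, as a motion
 * search would, and keep the smallest SAD. */
static uint32_t best_of_four(const uint8_t *src, int src_stride,
                             const uint8_t *const refs[4], int ref_stride) {
  uint32_t sad[4];
  vpx_sad16x16x4d_neon(src, src_stride, refs, ref_stride, sad);
  uint32_t best = sad[0];
  for (int i = 1; i < 4; ++i) {
    if (sad[i] < best) best = sad[i];
  }
  return best;
}
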
diff --git a/config/arm-neon/vpx_version.h b/config/arm-neon/vpx_version.h
index 5d2835a..a90ab60 100644
--- a/config/arm-neon/vpx_version.h
+++ b/config/arm-neon/vpx_version.h
@@ -1,8 +1,8 @@
 // This file is generated. Do not edit.
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  11
+#define VERSION_MINOR  12
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.11.0"
-#define VERSION_STRING      " v1.11.0"
+#define VERSION_STRING_NOSP "v1.12.0"
+#define VERSION_STRING      " v1.12.0"
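
VERSION_PACKED places the major version in the high bits, the minor in bits 8 through 15, and the patch in the low byte, so v1.12.0 packs to 0x010C00. A standalone check mirroring the generated macros above (the main function is illustrative):

#include <assert.h>
#include <stdio.h>

#define VERSION_MAJOR  1
#define VERSION_MINOR  12
#define VERSION_PATCH  0
#define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))

int main(void) {
  assert(VERSION_PACKED == 0x010C00);
  /* Unpack with shifts and masks to recover v1.12.0. */
  printf("v%d.%d.%d\n", (VERSION_PACKED >> 16) & 0xff,
         (VERSION_PACKED >> 8) & 0xff, VERSION_PACKED & 0xff);
  return 0;
}
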
diff --git a/config/arm64/vp8_rtcd.h b/config/arm64/vp8_rtcd.h
index 328c67e..d204ef7 100644
--- a/config/arm64/vp8_rtcd.h
+++ b/config/arm64/vp8_rtcd.h
@@ -96,9 +96,6 @@
 void vp8_fast_quantize_b_neon(struct block *, struct blockd *);
 #define vp8_fast_quantize_b vp8_fast_quantize_b_neon
 
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_full_search_sad vp8_full_search_sad_c
-
 void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 void vp8_loop_filter_bh_neon(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_bh vp8_loop_filter_bh_neon
diff --git a/config/arm64/vp9_rtcd.h b/config/arm64/vp9_rtcd.h
index 859eadd..01065e6 100644
--- a/config/arm64/vp9_rtcd.h
+++ b/config/arm64/vp9_rtcd.h
@@ -41,13 +41,16 @@
 #define vp9_diamond_search_sad vp9_diamond_search_sad_c
 
 void vp9_fht16x16_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_fht16x16 vp9_fht16x16_c
+void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht16x16 vp9_fht16x16_neon
 
 void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_fht4x4 vp9_fht4x4_c
+void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht4x4 vp9_fht4x4_neon
 
 void vp9_fht8x8_c(const int16_t *input, tran_low_t *output, int stride, int tx_type);
-#define vp9_fht8x8 vp9_fht8x8_c
+void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride, int tx_type);
+#define vp9_fht8x8 vp9_fht8x8_neon
 
 void vp9_fwht4x4_c(const int16_t *input, tran_low_t *output, int stride);
 #define vp9_fwht4x4 vp9_fwht4x4_c
@@ -79,10 +82,10 @@
 void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
 #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_neon
 
-void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
 
-void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
 
 void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count);
@@ -100,12 +103,12 @@
 void vp9_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 #define vp9_iht8x8_64_add vp9_iht8x8_64_add_neon
 
-void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_fp vp9_quantize_fp_neon
 
-void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_neon
 
 void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
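
These rtcd headers dispatch at compile time: each public name is a macro bound to the best implementation the configuration supports, so retargeting vp9_fht4x4, vp9_fht8x8, and vp9_fht16x16 to their new NEON versions changes every call site with no runtime indirection. A sketch of the pattern; tran_low_t is assumed to be int16_t here, which holds only for non-high-bitdepth builds:

#include <stdint.h>

typedef int16_t tran_low_t; /* assumption: non-high-bitdepth build */

void vp9_fht4x4_c(const int16_t *input, tran_low_t *output, int stride,
                  int tx_type);
void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride,
                     int tx_type);
#define vp9_fht4x4 vp9_fht4x4_neon /* resolved to the _c version in v1.11.0 */

/* A call such as vp9_fht4x4(in, out, 4, tx_type) now compiles directly to
 * the NEON symbol. */
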
diff --git a/config/arm64/vpx_config.asm b/config/arm64/vpx_config.asm
index 2b37d02..c4b840b 100644
--- a/config/arm64/vpx_config.asm
+++ b/config/arm64/vpx_config.asm
@@ -1,11 +1,12 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.syntax unified
+.syntax unified
 .equ VPX_ARCH_ARM ,  1
 .equ VPX_ARCH_MIPS ,  0
 .equ VPX_ARCH_X86 ,  0
 .equ VPX_ARCH_X86_64 ,  0
 .equ VPX_ARCH_PPC ,  0
+.equ VPX_ARCH_LOONGARCH ,  0
 .equ HAVE_NEON ,  1
 .equ HAVE_NEON_ASM ,  0
 .equ HAVE_MIPS32 ,  0
@@ -23,6 +24,8 @@
 .equ HAVE_AVX512 ,  0
 .equ HAVE_VSX ,  0
 .equ HAVE_MMI ,  0
+.equ HAVE_LSX ,  0
+.equ HAVE_LASX ,  0
 .equ HAVE_VPX_PORTS ,  1
 .equ HAVE_PTHREAD_H ,  1
 .equ HAVE_UNISTD_H ,  1
@@ -88,4 +91,4 @@
 .equ CONFIG_EMULATE_HARDWARE ,  0
 .equ CONFIG_NON_GREEDY_MV ,  0
 .equ CONFIG_RATE_CTRL ,  0
-	.section	.note.GNU-stack,"",%progbits
+    .section .note.GNU-stack,"",%progbits
diff --git a/config/arm64/vpx_config.h b/config/arm64/vpx_config.h
index 9497732..247c0ea 100644
--- a/config/arm64/vpx_config.h
+++ b/config/arm64/vpx_config.h
@@ -15,6 +15,7 @@
 #define VPX_ARCH_X86 0
 #define VPX_ARCH_X86_64 0
 #define VPX_ARCH_PPC 0
+#define VPX_ARCH_LOONGARCH 0
 #define HAVE_NEON 1
 #define HAVE_NEON_ASM 0
 #define HAVE_MIPS32 0
@@ -32,6 +33,8 @@
 #define HAVE_AVX512 0
 #define HAVE_VSX 0
 #define HAVE_MMI 0
+#define HAVE_LSX 0
+#define HAVE_LASX 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 1
 #define HAVE_UNISTD_H 1
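
The three new macros track libvpx's LoongArch port; on this arm64 configuration they are all zero, so any LSX/LASX path compiles out. A hypothetical illustration of how such generated flags gate SIMD code (the function names are invented, not libvpx's):

/* Hypothetical config-gated dispatch; not libvpx source. */
#define HAVE_LSX 0 /* as in the generated vpx_config.h above */

static int sum_c(const int *v, int n) {
  int s = 0;
  for (int i = 0; i < n; ++i) s += v[i];
  return s;
}

#if HAVE_LSX
int sum_lsx(const int *v, int n); /* 128-bit LoongArch SIMD variant */
#define sum_dispatch sum_lsx
#else
#define sum_dispatch sum_c /* chosen here, since HAVE_LSX is 0 */
#endif
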
diff --git a/config/arm64/vpx_dsp_rtcd.h b/config/arm64/vpx_dsp_rtcd.h
index e6bd5b2..99abbb9 100644
--- a/config/arm64/vpx_dsp_rtcd.h
+++ b/config/arm64/vpx_dsp_rtcd.h
@@ -1045,10 +1045,10 @@
 void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max);
 #define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
 
-void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vpx_highbd_quantize_b vpx_highbd_quantize_b_c
 
-void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c
 
 unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1057,7 +1057,7 @@
 unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c
 
-void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c
 
 unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1066,7 +1066,7 @@
 unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c
 
-void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c
 
 unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1075,7 +1075,7 @@
 unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c
 
-void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c
 
 unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1084,7 +1084,7 @@
 unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c
 
-void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c
 
 unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1093,7 +1093,7 @@
 unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c
 
-void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c
 
 unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1102,7 +1102,7 @@
 unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c
 
-void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c
 
 unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1111,7 +1111,7 @@
 unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
 
-void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c
 
 unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1120,7 +1120,7 @@
 unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
 
-void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c
 
 unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1129,7 +1129,7 @@
 unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c
 
-void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c
 
 unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1138,7 +1138,7 @@
 unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c
 
-void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c
 
 unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1147,7 +1147,7 @@
 unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c
 
-void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c
 
 unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1156,7 +1156,7 @@
 unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c
 
-void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c
 
 unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1165,7 +1165,7 @@
 unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c
 
-void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c
 
 int vpx_highbd_satd_c(const tran_low_t *coeff, int length);
@@ -1262,8 +1262,8 @@
 int16_t vpx_int_pro_col_neon(const uint8_t *ref, const int width);
 #define vpx_int_pro_col vpx_int_pro_col_neon
 
-void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
-void vpx_int_pro_row_neon(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_neon(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
 #define vpx_int_pro_row vpx_int_pro_row_neon
 
 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -1337,12 +1337,12 @@
 unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_mse8x8 vpx_mse8x8_c
 
-void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vpx_quantize_b vpx_quantize_b_neon
 
-void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vpx_quantize_b_32x32 vpx_quantize_b_32x32_neon
 
 unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1353,16 +1353,10 @@
 unsigned int vpx_sad16x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x16_avg vpx_sad16x16_avg_neon
 
-void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x16x3 vpx_sad16x16x3_c
-
-void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x16x4d vpx_sad16x16x4d_neon
 
-void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x16x8 vpx_sad16x16x8_c
-
 unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x32_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad16x32 vpx_sad16x32_neon
@@ -1371,8 +1365,8 @@
 unsigned int vpx_sad16x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x32_avg vpx_sad16x32_avg_neon
 
-void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x32x4d vpx_sad16x32x4d_neon
 
 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1383,16 +1377,10 @@
 unsigned int vpx_sad16x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x8_avg vpx_sad16x8_avg_neon
 
-void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x8x3 vpx_sad16x8x3_c
-
-void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x8x4d vpx_sad16x8x4d_neon
 
-void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x8x8 vpx_sad16x8x8_c
-
 unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x16_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad32x16 vpx_sad32x16_neon
@@ -1401,8 +1389,8 @@
 unsigned int vpx_sad32x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x16_avg vpx_sad32x16_avg_neon
 
-void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x16x4d vpx_sad32x16x4d_neon
 
 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1413,13 +1401,10 @@
 unsigned int vpx_sad32x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x32_avg vpx_sad32x32_avg_neon
 
-void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x32x4d vpx_sad32x32x4d_neon
 
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x64_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad32x64 vpx_sad32x64_neon
@@ -1428,8 +1413,8 @@
 unsigned int vpx_sad32x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x64_avg vpx_sad32x64_avg_neon
 
-void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x64x4d vpx_sad32x64x4d_neon
 
 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1440,16 +1425,10 @@
 unsigned int vpx_sad4x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad4x4_avg vpx_sad4x4_avg_neon
 
-void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x4x3 vpx_sad4x4x3_c
-
-void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad4x4x4d vpx_sad4x4x4d_neon
 
-void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x4x8 vpx_sad4x4x8_c
-
 unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad4x8_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad4x8 vpx_sad4x8_neon
@@ -1458,8 +1437,8 @@
 unsigned int vpx_sad4x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad4x8_avg vpx_sad4x8_avg_neon
 
-void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad4x8x4d vpx_sad4x8x4d_neon
 
 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1470,8 +1449,8 @@
 unsigned int vpx_sad64x32_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad64x32_avg vpx_sad64x32_avg_neon
 
-void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad64x32x4d vpx_sad64x32x4d_neon
 
 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1482,8 +1461,8 @@
 unsigned int vpx_sad64x64_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad64x64_avg vpx_sad64x64_avg_neon
 
-void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad64x64x4d vpx_sad64x64x4d_neon
 
 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1494,16 +1473,10 @@
 unsigned int vpx_sad8x16_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x16_avg vpx_sad8x16_avg_neon
 
-void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x16x3 vpx_sad8x16x3_c
-
-void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x16x4d vpx_sad8x16x4d_neon
 
-void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x16x8 vpx_sad8x16x8_c
-
 unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x4_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x4 vpx_sad8x4_neon
@@ -1512,8 +1485,8 @@
 unsigned int vpx_sad8x4_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x4_avg vpx_sad8x4_avg_neon
 
-void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x4x4d vpx_sad8x4x4d_neon
 
 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1524,16 +1497,10 @@
 unsigned int vpx_sad8x8_avg_neon(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x8_avg vpx_sad8x8_avg_neon
 
-void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x8x3 vpx_sad8x8x3_c
-
-void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x8x4d vpx_sad8x8x4d_neon
 
-void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x8x8 vpx_sad8x8x8_c
-
 int vpx_satd_c(const tran_low_t *coeff, int length);
 int vpx_satd_neon(const tran_low_t *coeff, int length);
 #define vpx_satd vpx_satd_neon
diff --git a/config/arm64/vpx_version.h b/config/arm64/vpx_version.h
index 5d2835a..a90ab60 100644
--- a/config/arm64/vpx_version.h
+++ b/config/arm64/vpx_version.h
@@ -1,8 +1,8 @@
 // This file is generated. Do not edit.
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  11
+#define VERSION_MINOR  12
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.11.0"
-#define VERSION_STRING      " v1.11.0"
+#define VERSION_STRING_NOSP "v1.12.0"
+#define VERSION_STRING      " v1.12.0"
diff --git a/config/generic/vp8_rtcd.h b/config/generic/vp8_rtcd.h
index 33c0582..99cc2f7 100644
--- a/config/generic/vp8_rtcd.h
+++ b/config/generic/vp8_rtcd.h
@@ -81,9 +81,6 @@
 void vp8_fast_quantize_b_c(struct block *, struct blockd *);
 #define vp8_fast_quantize_b vp8_fast_quantize_b_c
 
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_full_search_sad vp8_full_search_sad_c
-
 void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_bh vp8_loop_filter_bh_c
 
diff --git a/config/generic/vp9_rtcd.h b/config/generic/vp9_rtcd.h
index 3c9e237..07d2453 100644
--- a/config/generic/vp9_rtcd.h
+++ b/config/generic/vp9_rtcd.h
@@ -76,10 +76,10 @@
 void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
 #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
 
-void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
 
-void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
 
 void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count);
@@ -94,10 +94,10 @@
 void vp9_iht8x8_64_add_c(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 #define vp9_iht8x8_64_add vp9_iht8x8_64_add_c
 
-void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_fp vp9_quantize_fp_c
 
-void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
 
 void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
diff --git a/config/generic/vpx_config.asm b/config/generic/vpx_config.asm
index b0675f9..ffeb85e 100644
--- a/config/generic/vpx_config.asm
+++ b/config/generic/vpx_config.asm
@@ -1,11 +1,12 @@
 @ This file was created from a .asm file
 @  using the ads2gas.pl script.
-	.syntax unified
+.syntax unified
 .equ VPX_ARCH_ARM ,  0
 .equ VPX_ARCH_MIPS ,  0
 .equ VPX_ARCH_X86 ,  0
 .equ VPX_ARCH_X86_64 ,  0
 .equ VPX_ARCH_PPC ,  0
+.equ VPX_ARCH_LOONGARCH ,  0
 .equ HAVE_NEON ,  0
 .equ HAVE_NEON_ASM ,  0
 .equ HAVE_MIPS32 ,  0
@@ -23,6 +24,8 @@
 .equ HAVE_AVX512 ,  0
 .equ HAVE_VSX ,  0
 .equ HAVE_MMI ,  0
+.equ HAVE_LSX ,  0
+.equ HAVE_LASX ,  0
 .equ HAVE_VPX_PORTS ,  1
 .equ HAVE_PTHREAD_H ,  1
 .equ HAVE_UNISTD_H ,  1
@@ -88,4 +91,4 @@
 .equ CONFIG_EMULATE_HARDWARE ,  0
 .equ CONFIG_NON_GREEDY_MV ,  0
 .equ CONFIG_RATE_CTRL ,  0
-	.section	.note.GNU-stack,"",%progbits
+    .section .note.GNU-stack,"",%progbits
diff --git a/config/generic/vpx_config.h b/config/generic/vpx_config.h
index 9ef767d..c9d8393 100644
--- a/config/generic/vpx_config.h
+++ b/config/generic/vpx_config.h
@@ -15,6 +15,7 @@
 #define VPX_ARCH_X86 0
 #define VPX_ARCH_X86_64 0
 #define VPX_ARCH_PPC 0
+#define VPX_ARCH_LOONGARCH 0
 #define HAVE_NEON 0
 #define HAVE_NEON_ASM 0
 #define HAVE_MIPS32 0
@@ -32,6 +33,8 @@
 #define HAVE_AVX512 0
 #define HAVE_VSX 0
 #define HAVE_MMI 0
+#define HAVE_LSX 0
+#define HAVE_LASX 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 1
 #define HAVE_UNISTD_H 1
diff --git a/config/generic/vpx_dsp_rtcd.h b/config/generic/vpx_dsp_rtcd.h
index 89909af..328601f 100644
--- a/config/generic/vpx_dsp_rtcd.h
+++ b/config/generic/vpx_dsp_rtcd.h
@@ -930,10 +930,10 @@
 void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max);
 #define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
 
-void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vpx_highbd_quantize_b vpx_highbd_quantize_b_c
 
-void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_c
 
 unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -942,7 +942,7 @@
 unsigned int vpx_highbd_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_c
 
-void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_c
 
 unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -951,7 +951,7 @@
 unsigned int vpx_highbd_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_c
 
-void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_c
 
 unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -960,7 +960,7 @@
 unsigned int vpx_highbd_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_c
 
-void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_c
 
 unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -969,7 +969,7 @@
 unsigned int vpx_highbd_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_c
 
-void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_c
 
 unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -978,7 +978,7 @@
 unsigned int vpx_highbd_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_c
 
-void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_c
 
 unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -987,7 +987,7 @@
 unsigned int vpx_highbd_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_c
 
-void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_c
 
 unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -996,7 +996,7 @@
 unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
 
-void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_c
 
 unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1005,7 +1005,7 @@
 unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
 
-void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_c
 
 unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1014,7 +1014,7 @@
 unsigned int vpx_highbd_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_c
 
-void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_c
 
 unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1023,7 +1023,7 @@
 unsigned int vpx_highbd_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_c
 
-void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_c
 
 unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1032,7 +1032,7 @@
 unsigned int vpx_highbd_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_c
 
-void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_c
 
 unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1041,7 +1041,7 @@
 unsigned int vpx_highbd_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_c
 
-void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_c
 
 unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1050,7 +1050,7 @@
 unsigned int vpx_highbd_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_c
 
-void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_c
 
 int vpx_highbd_satd_c(const tran_low_t *coeff, int length);
@@ -1125,7 +1125,7 @@
 int16_t vpx_int_pro_col_c(const uint8_t *ref, const int width);
 #define vpx_int_pro_col vpx_int_pro_col_c
 
-void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
 #define vpx_int_pro_row vpx_int_pro_row_c
 
 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -1185,10 +1185,10 @@
 unsigned int vpx_mse8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse);
 #define vpx_mse8x8 vpx_mse8x8_c
 
-void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vpx_quantize_b vpx_quantize_b_c
 
-void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vpx_quantize_b_32x32 vpx_quantize_b_32x32_c
 
 unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1197,22 +1197,16 @@
 unsigned int vpx_sad16x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x16_avg vpx_sad16x16_avg_c
 
-void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x16x3 vpx_sad16x16x3_c
-
-void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x16x4d vpx_sad16x16x4d_c
 
-void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x16x8 vpx_sad16x16x8_c
-
 unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad16x32 vpx_sad16x32_c
 
 unsigned int vpx_sad16x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x32_avg vpx_sad16x32_avg_c
 
-void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x32x4d vpx_sad16x32x4d_c
 
 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1221,22 +1215,16 @@
 unsigned int vpx_sad16x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x8_avg vpx_sad16x8_avg_c
 
-void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x8x3 vpx_sad16x8x3_c
-
-void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x8x4d vpx_sad16x8x4d_c
 
-void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x8x8 vpx_sad16x8x8_c
-
 unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad32x16 vpx_sad32x16_c
 
 unsigned int vpx_sad32x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x16_avg vpx_sad32x16_avg_c
 
-void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x16x4d vpx_sad32x16x4d_c
 
 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1245,19 +1233,16 @@
 unsigned int vpx_sad32x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x32_avg vpx_sad32x32_avg_c
 
-void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x32x4d vpx_sad32x32x4d_c
 
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad32x64 vpx_sad32x64_c
 
 unsigned int vpx_sad32x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x64_avg vpx_sad32x64_avg_c
 
-void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x64x4d vpx_sad32x64x4d_c
 
 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1266,22 +1251,16 @@
 unsigned int vpx_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad4x4_avg vpx_sad4x4_avg_c
 
-void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x4x3 vpx_sad4x4x3_c
-
-void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad4x4x4d vpx_sad4x4x4d_c
 
-void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x4x8 vpx_sad4x4x8_c
-
 unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad4x8 vpx_sad4x8_c
 
 unsigned int vpx_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad4x8_avg vpx_sad4x8_avg_c
 
-void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad4x8x4d vpx_sad4x8x4d_c
 
 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1290,7 +1269,7 @@
 unsigned int vpx_sad64x32_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad64x32_avg vpx_sad64x32_avg_c
 
-void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad64x32x4d vpx_sad64x32x4d_c
 
 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1299,7 +1278,7 @@
 unsigned int vpx_sad64x64_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad64x64_avg vpx_sad64x64_avg_c
 
-void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad64x64x4d vpx_sad64x64x4d_c
 
 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1308,22 +1287,16 @@
 unsigned int vpx_sad8x16_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x16_avg vpx_sad8x16_avg_c
 
-void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x16x3 vpx_sad8x16x3_c
-
-void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x16x4d vpx_sad8x16x4d_c
 
-void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x16x8 vpx_sad8x16x8_c
-
 unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x4 vpx_sad8x4_c
 
 unsigned int vpx_sad8x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x4_avg vpx_sad8x4_avg_c
 
-void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x4x4d vpx_sad8x4x4d_c
 
 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1332,15 +1305,9 @@
 unsigned int vpx_sad8x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x8_avg vpx_sad8x8_avg_c
 
-void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x8x3 vpx_sad8x8x3_c
-
-void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x8x4d vpx_sad8x8x4d_c
 
-void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x8x8 vpx_sad8x8x8_c
-
 int vpx_satd_c(const tran_low_t *coeff, int length);
 #define vpx_satd vpx_satd_c
 
diff --git a/config/generic/vpx_version.h b/config/generic/vpx_version.h
index 5d2835a..a90ab60 100644
--- a/config/generic/vpx_version.h
+++ b/config/generic/vpx_version.h
@@ -1,8 +1,8 @@
 // This file is generated. Do not edit.
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  11
+#define VERSION_MINOR  12
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.11.0"
-#define VERSION_STRING      " v1.11.0"
+#define VERSION_STRING_NOSP "v1.12.0"
+#define VERSION_STRING      " v1.12.0"
diff --git a/config/x86/vp8_rtcd.h b/config/x86/vp8_rtcd.h
index c0d3b16..5f7b326 100644
--- a/config/x86/vp8_rtcd.h
+++ b/config/x86/vp8_rtcd.h
@@ -123,10 +123,6 @@
 void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
 #define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_sse2
 
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_full_search_sad vp8_full_search_sadx3
-
 void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_bh vp8_loop_filter_bh_sse2
diff --git a/config/x86/vp9_rtcd.h b/config/x86/vp9_rtcd.h
index 6706d80..cff5e7f 100644
--- a/config/x86/vp9_rtcd.h
+++ b/config/x86/vp9_rtcd.h
@@ -83,10 +83,10 @@
 void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
 #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
 
-void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
 
-void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
 
 void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count);
@@ -104,11 +104,11 @@
 void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 #define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2
 
-void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_fp vp9_quantize_fp_sse2
 
-void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_c
 
 void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
diff --git a/config/x86/vpx_config.asm b/config/x86/vpx_config.asm
index acae8bd..8c108ef 100644
--- a/config/x86/vpx_config.asm
+++ b/config/x86/vpx_config.asm
@@ -3,6 +3,7 @@
 %define VPX_ARCH_X86 1
 %define VPX_ARCH_X86_64 0
 %define VPX_ARCH_PPC 0
+%define VPX_ARCH_LOONGARCH 0
 %define HAVE_NEON 0
 %define HAVE_NEON_ASM 0
 %define HAVE_MIPS32 0
@@ -20,6 +21,8 @@
 %define HAVE_AVX512 0
 %define HAVE_VSX 0
 %define HAVE_MMI 0
+%define HAVE_LSX 0
+%define HAVE_LASX 0
 %define HAVE_VPX_PORTS 1
 %define HAVE_PTHREAD_H 1
 %define HAVE_UNISTD_H 1
diff --git a/config/x86/vpx_config.h b/config/x86/vpx_config.h
index 4fad516..6cc7eda 100644
--- a/config/x86/vpx_config.h
+++ b/config/x86/vpx_config.h
@@ -15,6 +15,7 @@
 #define VPX_ARCH_X86 1
 #define VPX_ARCH_X86_64 0
 #define VPX_ARCH_PPC 0
+#define VPX_ARCH_LOONGARCH 0
 #define HAVE_NEON 0
 #define HAVE_NEON_ASM 0
 #define HAVE_MIPS32 0
@@ -32,6 +33,8 @@
 #define HAVE_AVX512 0
 #define HAVE_VSX 0
 #define HAVE_MMI 0
+#define HAVE_LSX 0
+#define HAVE_LASX 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 1
 #define HAVE_UNISTD_H 1
diff --git a/config/x86/vpx_dsp_rtcd.h b/config/x86/vpx_dsp_rtcd.h
index 91ce2e1..8b94dd8 100644
--- a/config/x86/vpx_dsp_rtcd.h
+++ b/config/x86/vpx_dsp_rtcd.h
@@ -1184,12 +1184,12 @@
 void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max);
 #define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
 
-void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vpx_highbd_quantize_b vpx_highbd_quantize_b_sse2
 
-void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_sse2
 
 unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1200,8 +1200,8 @@
 unsigned int vpx_highbd_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_sse2
 
-void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_sse2
 
 unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1212,8 +1212,8 @@
 unsigned int vpx_highbd_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_sse2
 
-void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_sse2
 
 unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1224,8 +1224,8 @@
 unsigned int vpx_highbd_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_sse2
 
-void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_sse2
 
 unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1236,8 +1236,8 @@
 unsigned int vpx_highbd_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_sse2
 
-void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_sse2
 
 unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1248,8 +1248,8 @@
 unsigned int vpx_highbd_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_sse2
 
-void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_sse2
 
 unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1260,8 +1260,8 @@
 unsigned int vpx_highbd_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_sse2
 
-void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_sse2
 
 unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1270,8 +1270,8 @@
 unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
 
-void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_sse2
 
 unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1280,8 +1280,8 @@
 unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
 
-void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_sse2
 
 unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1292,8 +1292,8 @@
 unsigned int vpx_highbd_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_sse2
 
-void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_sse2
 
 unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1304,8 +1304,8 @@
 unsigned int vpx_highbd_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_sse2
 
-void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_sse2
 
 unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1316,8 +1316,8 @@
 unsigned int vpx_highbd_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_sse2
 
-void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_sse2
 
 unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1328,8 +1328,8 @@
 unsigned int vpx_highbd_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_sse2
 
-void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_sse2
 
 unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1340,8 +1340,8 @@
 unsigned int vpx_highbd_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_sse2
 
-void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_sse2
 
 int vpx_highbd_satd_c(const tran_low_t *coeff, int length);
@@ -1441,8 +1441,8 @@
 int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width);
 #define vpx_int_pro_col vpx_int_pro_col_sse2
 
-void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
-void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
 #define vpx_int_pro_row vpx_int_pro_row_sse2
 
 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -1536,13 +1536,13 @@
 void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
 #define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_sse2
 
-void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vpx_quantize_b vpx_quantize_b_ssse3
 
-void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vpx_quantize_b_32x32 vpx_quantize_b_32x32_ssse3
 
 unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1553,18 +1553,10 @@
 unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x16_avg vpx_sad16x16_avg_sse2
 
-void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x16x3 vpx_sad16x16x3_ssse3
-
-void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x16x4d vpx_sad16x16x4d_sse2
 
-void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x16x8 vpx_sad16x16x8_c
-
 unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad16x32 vpx_sad16x32_sse2
@@ -1573,8 +1565,8 @@
 unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x32_avg vpx_sad16x32_avg_sse2
 
-void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x32x4d vpx_sad16x32x4d_sse2
 
 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1585,18 +1577,10 @@
 unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x8_avg vpx_sad16x8_avg_sse2
 
-void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x8x3 vpx_sad16x8x3_ssse3
-
-void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x8x4d vpx_sad16x8x4d_sse2
 
-void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x8x8 vpx_sad16x8x8_c
-
 unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad32x16 vpx_sad32x16_sse2
@@ -1605,8 +1589,8 @@
 unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x16_avg vpx_sad32x16_avg_sse2
 
-void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x16x4d vpx_sad32x16x4d_sse2
 
 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1617,13 +1601,10 @@
 unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x32_avg vpx_sad32x32_avg_sse2
 
-void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x32x4d vpx_sad32x32x4d_sse2
 
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad32x64 vpx_sad32x64_sse2
@@ -1632,8 +1613,8 @@
 unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x64_avg vpx_sad32x64_avg_sse2
 
-void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x64x4d vpx_sad32x64x4d_sse2
 
 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1644,17 +1625,10 @@
 unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad4x4_avg vpx_sad4x4_avg_sse2
 
-void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x4x3 vpx_sad4x4x3_sse3
-
-void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad4x4x4d vpx_sad4x4x4d_sse2
 
-void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x4x8 vpx_sad4x4x8_c
-
 unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad4x8 vpx_sad4x8_sse2
@@ -1663,8 +1637,8 @@
 unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad4x8_avg vpx_sad4x8_avg_sse2
 
-void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad4x8x4d vpx_sad4x8x4d_sse2
 
 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1675,8 +1649,8 @@
 unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad64x32_avg vpx_sad64x32_avg_sse2
 
-void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad64x32x4d vpx_sad64x32x4d_sse2
 
 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1687,8 +1661,8 @@
 unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad64x64_avg vpx_sad64x64_avg_sse2
 
-void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad64x64x4d vpx_sad64x64x4d_sse2
 
 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1699,17 +1673,10 @@
 unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x16_avg vpx_sad8x16_avg_sse2
 
-void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x16x3 vpx_sad8x16x3_sse3
-
-void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x16x4d vpx_sad8x16x4d_sse2
 
-void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x16x8 vpx_sad8x16x8_c
-
 unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x4 vpx_sad8x4_sse2
@@ -1718,8 +1685,8 @@
 unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x4_avg vpx_sad8x4_avg_sse2
 
-void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x4x4d vpx_sad8x4x4d_sse2
 
 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1730,17 +1697,10 @@
 unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x8_avg vpx_sad8x8_avg_sse2
 
-void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x8x3 vpx_sad8x8x3_sse3
-
-void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x8x4d vpx_sad8x8x4d_sse2
 
-void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x8x8 vpx_sad8x8x8_c
-
 int vpx_satd_c(const tran_low_t *coeff, int length);
 int vpx_satd_sse2(const tran_low_t *coeff, int length);
 #define vpx_satd vpx_satd_sse2
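
Note on the vpx_dsp_rtcd.h changes above: in v1.12.0 the x4d SAD prototypes spell out the fixed length of their array arguments (ref_array[4] in, sad_array[4] out), and the three-candidate (x3) and eight-candidate (x8) SAD variants are dropped from the dispatch table entirely, leaving x4d as the only multi-reference kernel. A self-contained scalar sketch of what a 16x16 x4d kernel computes (illustrative only; sad16x16x4d_scalar is local to this sketch, not a libvpx symbol):

    #include <stdint.h>
    #include <stdlib.h>

    /* Sum of absolute differences between one source block and four
     * reference blocks, matching the x4d prototype shape: a fixed-size
     * ref_array[4] in, a fixed-size sad_array[4] out. */
    static void sad16x16x4d_scalar(const uint8_t *src_ptr, int src_stride,
                                   const uint8_t *const ref_array[4],
                                   int ref_stride, uint32_t sad_array[4]) {
      for (int i = 0; i < 4; ++i) {
        const uint8_t *ref = ref_array[i];
        uint32_t sad = 0;
        for (int r = 0; r < 16; ++r) {
          for (int c = 0; c < 16; ++c)
            sad += (uint32_t)abs(src_ptr[r * src_stride + c] -
                                 ref[r * ref_stride + c]);
        }
        sad_array[i] = sad;
      }
    }

The SSE2 variant computes the same four sums in parallel; the [4] bounds in the new prototypes simply document that contract where it was previously implied by the trailing pointer arguments.
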
diff --git a/config/x86/vpx_version.h b/config/x86/vpx_version.h
index 5d2835a..a90ab60 100644
--- a/config/x86/vpx_version.h
+++ b/config/x86/vpx_version.h
@@ -1,8 +1,8 @@
 // This file is generated. Do not edit.
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  11
+#define VERSION_MINOR  12
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.11.0"
-#define VERSION_STRING      " v1.11.0"
+#define VERSION_STRING_NOSP "v1.12.0"
+#define VERSION_STRING      " v1.12.0"
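
VERSION_PACKED folds major/minor/patch into a single integer, major<<16 | minor<<8 | patch, so the bump above packs v1.12.0 as (1<<16)|(12<<8)|0 = 0x10C00. A minimal sketch that round-trips the packing (the VERSION_* values are copied from the header above):

    #include <stdio.h>

    #define VERSION_MAJOR  1
    #define VERSION_MINOR  12
    #define VERSION_PATCH  0
    #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))

    int main(void) {
      /* 0x10C00 for v1.12.0: major in bits 16+, minor in bits 8-15,
       * patch in bits 0-7. */
      printf("packed=0x%X -> v%d.%d.%d\n", VERSION_PACKED,
             (VERSION_PACKED >> 16) & 0xFF, (VERSION_PACKED >> 8) & 0xFF,
             VERSION_PACKED & 0xFF);
      return 0;
    }
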
diff --git a/config/x86_64/vp8_rtcd.h b/config/x86_64/vp8_rtcd.h
index c0d3b16..5f7b326 100644
--- a/config/x86_64/vp8_rtcd.h
+++ b/config/x86_64/vp8_rtcd.h
@@ -123,10 +123,6 @@
 void vp8_filter_by_weight8x8_sse2(unsigned char *src, int src_stride, unsigned char *dst, int dst_stride, int src_weight);
 #define vp8_filter_by_weight8x8 vp8_filter_by_weight8x8_sse2
 
-int vp8_full_search_sad_c(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-int vp8_full_search_sadx3(struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv);
-#define vp8_full_search_sad vp8_full_search_sadx3
-
 void vp8_loop_filter_bh_c(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 void vp8_loop_filter_bh_sse2(unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi);
 #define vp8_loop_filter_bh vp8_loop_filter_bh_sse2
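
These rtcd headers are per-target dispatch tables: each symbol is declared once per implementation and then bound at compile time with a #define to the best variant the target supports (vp8_loop_filter_bh above resolves to the sse2 version), so retiring vp8_full_search_sad only requires deleting its prototypes and its macro. A generic sketch of the same static-dispatch pattern (my_filter and both variants are hypothetical names, not libvpx symbols):

    #include <stdio.h>

    static void my_filter_c(const int *in, int *out, int n) {
      for (int i = 0; i < n; ++i) out[i] = in[i] * 2;
    }

    #if defined(__SSE2__)
    /* A real rtcd build would supply a SIMD body here; this stand-in
     * forwards to the C path so the sketch stays self-contained. */
    static void my_filter_sse2(const int *in, int *out, int n) {
      my_filter_c(in, out, n);
    }
    #define my_filter my_filter_sse2
    #else
    #define my_filter my_filter_c
    #endif

    int main(void) {
      int in[4] = {1, 2, 3, 4}, out[4];
      my_filter(in, out, 4); /* resolved at compile time, no function pointer */
      printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
      return 0;
    }

Because the binding is a macro rather than a function pointer, an unused variant costs nothing at runtime and dead prototypes can be deleted outright, as done here.
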
diff --git a/config/x86_64/vp9_rtcd.h b/config/x86_64/vp9_rtcd.h
index 272cfe6..580d55a 100644
--- a/config/x86_64/vp9_rtcd.h
+++ b/config/x86_64/vp9_rtcd.h
@@ -83,10 +83,10 @@
 void vp9_highbd_iht8x8_64_add_c(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd);
 #define vp9_highbd_iht8x8_64_add vp9_highbd_iht8x8_64_add_c
 
-void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_highbd_quantize_fp vp9_highbd_quantize_fp_c
 
-void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_highbd_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_highbd_quantize_fp_32x32 vp9_highbd_quantize_fp_32x32_c
 
 void vp9_highbd_temporal_filter_apply_c(const uint8_t *frame1, unsigned int stride, const uint8_t *frame2, unsigned int block_width, unsigned int block_height, int strength, int *blk_fw, int use_32x32, uint32_t *accumulator, uint16_t *count);
@@ -104,13 +104,13 @@
 void vp9_iht8x8_64_add_sse2(const tran_low_t *input, uint8_t *dest, int stride, int tx_type);
 #define vp9_iht8x8_64_add vp9_iht8x8_64_add_sse2
 
-void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_fp vp9_quantize_fp_ssse3
 
-void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vp9_quantize_fp_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vp9_quantize_fp_32x32 vp9_quantize_fp_32x32_ssse3
 
 void vp9_scale_and_extend_frame_c(const struct yv12_buffer_config *src, struct yv12_buffer_config *dst, INTERP_FILTER filter_type, int phase_scaler);
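
The vp9 fp quantizer prototypes above lose the skip_block flag along with the rest of this release's quantize entry points; the kernels now always quantize and the skip decision stays with the caller. A simplified scalar sketch of the fp-style quantize step, hedged heavily: it follows the usual round/scale/sign-restore/eob shape but is illustrative only, not bit-exact libvpx behavior; quantize_fp_scalar is local to this sketch and the iscan argument is omitted:

    #include <stdint.h>
    #include <stddef.h>

    typedef int32_t tran_low_t; /* stand-in for libvpx's typedef */

    static void quantize_fp_scalar(const tran_low_t *coeff_ptr,
                                   intptr_t n_coeffs,
                                   const int16_t *round_ptr,
                                   const int16_t *quant_ptr,
                                   tran_low_t *qcoeff_ptr,
                                   tran_low_t *dqcoeff_ptr,
                                   const int16_t *dequant_ptr,
                                   uint16_t *eob_ptr, const int16_t *scan) {
      int eob = -1;
      for (intptr_t i = 0; i < n_coeffs; ++i) {
        const int rc = scan[i];
        const int k = rc != 0;  /* table index: 0 = DC entry, 1 = AC entry */
        const tran_low_t coeff = coeff_ptr[rc];
        const tran_low_t abs_coeff = coeff < 0 ? -coeff : coeff;
        /* Round, then scale by the Q16 quantization factor. */
        const int abs_q =
            (int)((((int64_t)abs_coeff + round_ptr[k]) * quant_ptr[k]) >> 16);
        qcoeff_ptr[rc] = coeff < 0 ? -abs_q : abs_q;
        dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant_ptr[k];
        if (abs_q) eob = (int)i;  /* last nonzero position in scan order */
      }
      *eob_ptr = (uint16_t)(eob + 1);
    }
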
diff --git a/config/x86_64/vpx_config.asm b/config/x86_64/vpx_config.asm
index 57df4e0..fdc51d3 100644
--- a/config/x86_64/vpx_config.asm
+++ b/config/x86_64/vpx_config.asm
@@ -3,6 +3,7 @@
 %define VPX_ARCH_X86 0
 %define VPX_ARCH_X86_64 1
 %define VPX_ARCH_PPC 0
+%define VPX_ARCH_LOONGARCH 0
 %define HAVE_NEON 0
 %define HAVE_NEON_ASM 0
 %define HAVE_MIPS32 0
@@ -20,6 +21,8 @@
 %define HAVE_AVX512 0
 %define HAVE_VSX 0
 %define HAVE_MMI 0
+%define HAVE_LSX 0
+%define HAVE_LASX 0
 %define HAVE_VPX_PORTS 1
 %define HAVE_PTHREAD_H 1
 %define HAVE_UNISTD_H 1
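
The assembly config gains VPX_ARCH_LOONGARCH plus the HAVE_LSX/HAVE_LASX SIMD toggles (all 0 on this x86_64 target), and the C config below mirrors them so assembly and C compilation units agree on the feature set. Every toggle is always defined to 0 or 1, which is what lets plain #if gate code paths; a small sketch (HAVE_LSX's value is taken from the config above; c_kernel and lsx_kernel are hypothetical):

    #include <stdio.h>

    #define HAVE_LSX 0 /* from vpx_config.h for this x86_64 target */

    static int c_kernel(int x) { return x + 1; }

    #if HAVE_LSX
    static int lsx_kernel(int x); /* would be the LoongArch SIMD path */
    #endif

    int main(void) {
      /* Toggles defined to 0/1 (never left undefined) make #if usable
       * directly, and a misspelled macro fails loudly under -Wundef
       * instead of silently evaluating to 0. */
    #if HAVE_LSX
      printf("%d\n", lsx_kernel(41));
    #else
      printf("%d\n", c_kernel(41));
    #endif
      return 0;
    }
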
diff --git a/config/x86_64/vpx_config.h b/config/x86_64/vpx_config.h
index 86744a0..c624a9f 100644
--- a/config/x86_64/vpx_config.h
+++ b/config/x86_64/vpx_config.h
@@ -15,6 +15,7 @@
 #define VPX_ARCH_X86 0
 #define VPX_ARCH_X86_64 1
 #define VPX_ARCH_PPC 0
+#define VPX_ARCH_LOONGARCH 0
 #define HAVE_NEON 0
 #define HAVE_NEON_ASM 0
 #define HAVE_MIPS32 0
@@ -32,6 +33,8 @@
 #define HAVE_AVX512 0
 #define HAVE_VSX 0
 #define HAVE_MMI 0
+#define HAVE_LSX 0
+#define HAVE_LASX 0
 #define HAVE_VPX_PORTS 1
 #define HAVE_PTHREAD_H 1
 #define HAVE_UNISTD_H 1
diff --git a/config/x86_64/vpx_dsp_rtcd.h b/config/x86_64/vpx_dsp_rtcd.h
index 19bf17e..284453f 100644
--- a/config/x86_64/vpx_dsp_rtcd.h
+++ b/config/x86_64/vpx_dsp_rtcd.h
@@ -1191,12 +1191,12 @@
 void vpx_highbd_minmax_8x8_c(const uint8_t *s8, int p, const uint8_t *d8, int dp, int *min, int *max);
 #define vpx_highbd_minmax_8x8 vpx_highbd_minmax_8x8_c
 
-void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vpx_highbd_quantize_b vpx_highbd_quantize_b_sse2
 
-void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_highbd_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_highbd_quantize_b_32x32_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vpx_highbd_quantize_b_32x32 vpx_highbd_quantize_b_32x32_sse2
 
 unsigned int vpx_highbd_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1207,8 +1207,8 @@
 unsigned int vpx_highbd_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad16x16_avg vpx_highbd_sad16x16_avg_sse2
 
-void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad16x16x4d vpx_highbd_sad16x16x4d_sse2
 
 unsigned int vpx_highbd_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1219,8 +1219,8 @@
 unsigned int vpx_highbd_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad16x32_avg vpx_highbd_sad16x32_avg_sse2
 
-void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad16x32x4d vpx_highbd_sad16x32x4d_sse2
 
 unsigned int vpx_highbd_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1231,8 +1231,8 @@
 unsigned int vpx_highbd_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad16x8_avg vpx_highbd_sad16x8_avg_sse2
 
-void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad16x8x4d vpx_highbd_sad16x8x4d_sse2
 
 unsigned int vpx_highbd_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1243,8 +1243,8 @@
 unsigned int vpx_highbd_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad32x16_avg vpx_highbd_sad32x16_avg_sse2
 
-void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad32x16x4d vpx_highbd_sad32x16x4d_sse2
 
 unsigned int vpx_highbd_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1255,8 +1255,8 @@
 unsigned int vpx_highbd_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad32x32_avg vpx_highbd_sad32x32_avg_sse2
 
-void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad32x32x4d vpx_highbd_sad32x32x4d_sse2
 
 unsigned int vpx_highbd_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1267,8 +1267,8 @@
 unsigned int vpx_highbd_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad32x64_avg vpx_highbd_sad32x64_avg_sse2
 
-void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad32x64x4d vpx_highbd_sad32x64x4d_sse2
 
 unsigned int vpx_highbd_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1277,8 +1277,8 @@
 unsigned int vpx_highbd_sad4x4_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad4x4_avg vpx_highbd_sad4x4_avg_c
 
-void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad4x4x4d vpx_highbd_sad4x4x4d_sse2
 
 unsigned int vpx_highbd_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1287,8 +1287,8 @@
 unsigned int vpx_highbd_sad4x8_avg_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad4x8_avg vpx_highbd_sad4x8_avg_c
 
-void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad4x8x4d vpx_highbd_sad4x8x4d_sse2
 
 unsigned int vpx_highbd_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1299,8 +1299,8 @@
 unsigned int vpx_highbd_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad64x32_avg vpx_highbd_sad64x32_avg_sse2
 
-void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad64x32x4d vpx_highbd_sad64x32x4d_sse2
 
 unsigned int vpx_highbd_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1311,8 +1311,8 @@
 unsigned int vpx_highbd_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad64x64_avg vpx_highbd_sad64x64_avg_sse2
 
-void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad64x64x4d vpx_highbd_sad64x64x4d_sse2
 
 unsigned int vpx_highbd_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1323,8 +1323,8 @@
 unsigned int vpx_highbd_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad8x16_avg vpx_highbd_sad8x16_avg_sse2
 
-void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad8x16x4d vpx_highbd_sad8x16x4d_sse2
 
 unsigned int vpx_highbd_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1335,8 +1335,8 @@
 unsigned int vpx_highbd_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad8x4_avg vpx_highbd_sad8x4_avg_sse2
 
-void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad8x4x4d vpx_highbd_sad8x4x4d_sse2
 
 unsigned int vpx_highbd_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1347,8 +1347,8 @@
 unsigned int vpx_highbd_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_highbd_sad8x8_avg vpx_highbd_sad8x8_avg_sse2
 
-void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_highbd_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_highbd_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_highbd_sad8x8x4d vpx_highbd_sad8x8x4d_sse2
 
 int vpx_highbd_satd_c(const tran_low_t *coeff, int length);
@@ -1448,8 +1448,8 @@
 int16_t vpx_int_pro_col_sse2(const uint8_t *ref, const int width);
 #define vpx_int_pro_col vpx_int_pro_col_sse2
 
-void vpx_int_pro_row_c(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
-void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_c(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
+void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height);
 #define vpx_int_pro_row vpx_int_pro_row_sse2
 
 void vpx_iwht4x4_16_add_c(const tran_low_t *input, uint8_t *dest, int stride);
@@ -1543,13 +1543,13 @@
 void vpx_post_proc_down_and_across_mb_row_sse2(unsigned char *src, unsigned char *dst, int src_pitch, int dst_pitch, int cols, unsigned char *flimits, int size);
 #define vpx_post_proc_down_and_across_mb_row vpx_post_proc_down_and_across_mb_row_sse2
 
-void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vpx_quantize_b vpx_quantize_b_ssse3
 
-void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
-void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
+void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan);
 #define vpx_quantize_b_32x32 vpx_quantize_b_32x32_ssse3
 
 unsigned int vpx_sad16x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1560,18 +1560,10 @@
 unsigned int vpx_sad16x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x16_avg vpx_sad16x16_avg_sse2
 
-void vpx_sad16x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x16x3 vpx_sad16x16x3_ssse3
-
-void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x16x4d vpx_sad16x16x4d_sse2
 
-void vpx_sad16x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x16x8 vpx_sad16x16x8_c
-
 unsigned int vpx_sad16x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad16x32_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad16x32 vpx_sad16x32_sse2
@@ -1580,8 +1572,8 @@
 unsigned int vpx_sad16x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x32_avg vpx_sad16x32_avg_sse2
 
-void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x32x4d vpx_sad16x32x4d_sse2
 
 unsigned int vpx_sad16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1592,18 +1584,10 @@
 unsigned int vpx_sad16x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad16x8_avg vpx_sad16x8_avg_sse2
 
-void vpx_sad16x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x3_ssse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x8x3 vpx_sad16x8x3_ssse3
-
-void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad16x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad16x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad16x8x4d vpx_sad16x8x4d_sse2
 
-void vpx_sad16x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad16x8x8 vpx_sad16x8x8_c
-
 unsigned int vpx_sad32x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x16_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad32x16 vpx_sad32x16_sse2
@@ -1612,8 +1596,8 @@
 unsigned int vpx_sad32x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x16_avg vpx_sad32x16_avg_sse2
 
-void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x16x4d vpx_sad32x16x4d_sse2
 
 unsigned int vpx_sad32x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1624,13 +1608,10 @@
 unsigned int vpx_sad32x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x32_avg vpx_sad32x32_avg_sse2
 
-void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x32x4d vpx_sad32x32x4d_sse2
 
-void vpx_sad32x32x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad32x32x8 vpx_sad32x32x8_c
-
 unsigned int vpx_sad32x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad32x64_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad32x64 vpx_sad32x64_sse2
@@ -1639,8 +1620,8 @@
 unsigned int vpx_sad32x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad32x64_avg vpx_sad32x64_avg_sse2
 
-void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad32x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad32x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad32x64x4d vpx_sad32x64x4d_sse2
 
 unsigned int vpx_sad4x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1651,17 +1632,10 @@
 unsigned int vpx_sad4x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad4x4_avg vpx_sad4x4_avg_sse2
 
-void vpx_sad4x4x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x4x3 vpx_sad4x4x3_sse3
-
-void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad4x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad4x4x4d vpx_sad4x4x4d_sse2
 
-void vpx_sad4x4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad4x4x8 vpx_sad4x4x8_c
-
 unsigned int vpx_sad4x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad4x8_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad4x8 vpx_sad4x8_sse2
@@ -1670,8 +1644,8 @@
 unsigned int vpx_sad4x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad4x8_avg vpx_sad4x8_avg_sse2
 
-void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad4x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad4x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad4x8x4d vpx_sad4x8x4d_sse2
 
 unsigned int vpx_sad64x32_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1682,8 +1656,8 @@
 unsigned int vpx_sad64x32_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad64x32_avg vpx_sad64x32_avg_sse2
 
-void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x32x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x32x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad64x32x4d vpx_sad64x32x4d_sse2
 
 unsigned int vpx_sad64x64_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1694,8 +1668,8 @@
 unsigned int vpx_sad64x64_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad64x64_avg vpx_sad64x64_avg_sse2
 
-void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad64x64x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad64x64x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad64x64x4d vpx_sad64x64x4d_sse2
 
 unsigned int vpx_sad8x16_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1706,17 +1680,10 @@
 unsigned int vpx_sad8x16_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x16_avg vpx_sad8x16_avg_sse2
 
-void vpx_sad8x16x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x16x3 vpx_sad8x16x3_sse3
-
-void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x16x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x16x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x16x4d vpx_sad8x16x4d_sse2
 
-void vpx_sad8x16x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x16x8 vpx_sad8x16x8_c
-
 unsigned int vpx_sad8x4_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 unsigned int vpx_sad8x4_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
 #define vpx_sad8x4 vpx_sad8x4_sse2
@@ -1725,8 +1692,8 @@
 unsigned int vpx_sad8x4_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x4_avg vpx_sad8x4_avg_sse2
 
-void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x4x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x4x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x4x4d vpx_sad8x4x4d_sse2
 
 unsigned int vpx_sad8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride);
@@ -1737,17 +1704,10 @@
 unsigned int vpx_sad8x8_avg_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred);
 #define vpx_sad8x8_avg vpx_sad8x8_avg_sse2
 
-void vpx_sad8x8x3_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x3_sse3(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x8x3 vpx_sad8x8x3_sse3
-
-void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
-void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array);
+void vpx_sad8x8x4d_c(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
+void vpx_sad8x8x4d_sse2(const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]);
 #define vpx_sad8x8x4d vpx_sad8x8x4d_sse2
 
-void vpx_sad8x8x8_c(const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array);
-#define vpx_sad8x8x8 vpx_sad8x8x8_c
-
 int vpx_satd_c(const tran_low_t *coeff, int length);
 int vpx_satd_sse2(const tran_low_t *coeff, int length);
 #define vpx_satd vpx_satd_sse2
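
The x4d prototypes in this generated header now declare their array parameters with explicit sizes (ref_array[4], sad_array[4]) rather than bare pointers, and the vpx_quantize_b prototypes drop the unused skip_block argument. A minimal sketch of driving the 4D SAD kernel through the dispatch macro; the wrapper and buffer names are hypothetical, everything else comes from the prototypes above:

    #include <stdint.h>
    #include "./vpx_dsp_rtcd.h" /* the generated header patched above */

    /* Compare one 16x16 source block against four candidate reference
     * blocks that share a stride; the kernel writes one SAD per
     * candidate into sad_out. */
    static void sad16x16_4d_example(const uint8_t *src, int src_stride,
                                    const uint8_t *ref0, const uint8_t *ref1,
                                    const uint8_t *ref2, const uint8_t *ref3,
                                    int ref_stride, uint32_t sad_out[4]) {
      const uint8_t *const refs[4] = { ref0, ref1, ref2, ref3 };
      /* On this x86_64 config the macro resolves to the SSE2 kernel. */
      vpx_sad16x16x4d(src, src_stride, refs, ref_stride, sad_out);
    }
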
diff --git a/config/x86_64/vpx_version.h b/config/x86_64/vpx_version.h
index 5d2835a..a90ab60 100644
--- a/config/x86_64/vpx_version.h
+++ b/config/x86_64/vpx_version.h
@@ -1,8 +1,8 @@
 // This file is generated. Do not edit.
 #define VERSION_MAJOR  1
-#define VERSION_MINOR  11
+#define VERSION_MINOR  12
 #define VERSION_PATCH  0
 #define VERSION_EXTRA  ""
 #define VERSION_PACKED ((VERSION_MAJOR<<16)|(VERSION_MINOR<<8)|(VERSION_PATCH))
-#define VERSION_STRING_NOSP "v1.11.0"
-#define VERSION_STRING      " v1.11.0"
+#define VERSION_STRING_NOSP "v1.12.0"
+#define VERSION_STRING      " v1.12.0"
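
VERSION_PACKED is plain bit-packing, so the bump is easy to sanity-check; a small self-contained program, assuming the patched header is on the include path:

    #include <stdio.h>
    #include "vpx_version.h" /* the header patched above */

    int main(void) {
      /* (1 << 16) | (12 << 8) | 0 = 65536 + 3072 = 68608 (0x10C00),
       * so a check like VERSION_PACKED >= 0x10C00 selects v1.12.0+. */
      printf("%s -> %d (0x%x)\n", VERSION_STRING_NOSP, VERSION_PACKED,
             VERSION_PACKED);
      return 0;
    }
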
diff --git a/libvpx/AUTHORS b/libvpx/AUTHORS
index 174cc59..fffda63 100644
--- a/libvpx/AUTHORS
+++ b/libvpx/AUTHORS
@@ -68,11 +68,13 @@
 Hangyu Kuang <hkuang@google.com>
 Hanno Böck <hanno@hboeck.de>
 Han Shen <shenhan@google.com>
+Hao Chen <chenhao@loongson.cn>
 Harish Mahendrakar <harish.mahendrakar@ittiam.com>
 Henrik Lundin <hlundin@google.com>
 Hien Ho <hienho@google.com>
 Hirokazu Honda <hiroh@chromium.org>
 Hui Su <huisu@google.com>
+Ilya Kurdyukov <jpegqs@gmail.com>
 Ivan Krasin <krasin@chromium.org>
 Ivan Maltz <ivanmaltz@google.com>
 Jacek Caban <cjacek@gmail.com>
@@ -91,9 +93,11 @@
 Jeremy Leconte <jleconte@google.com>
 Jerome Jiang <jianj@google.com>
 Jia Jia <jia.jia@linaro.org>
+Jianhui Dai <jianhui.j.dai@intel.com>
 Jian Zhou <zhoujian@google.com>
 Jim Bankoski <jimbankoski@google.com>
 jinbo <jinbo-hf@loongson.cn>
+Jin Bo <jinbo@loongson.cn>
 Jingning Han <jingning@google.com>
 Joel Fernandes <joelaf@google.com>
 Joey Parrish <joeyparrish@google.com>
@@ -111,6 +115,7 @@
 Justin Lebar <justin.lebar@gmail.com>
 Kaustubh Raste <kaustubh.raste@imgtec.com>
 KO Myung-Hun <komh@chollian.net>
+Konstantinos Margaritis <konma@vectorcamp.gr>
 Kyle Siefring <kylesiefring@gmail.com>
 Lawrence Velázquez <larryv@macports.org>
 Linfeng Zhang <linfengz@google.com>
@@ -118,6 +123,7 @@
 Lou Quillio <louquillio@google.com>
 Luca Barbato <lu_zero@gentoo.org>
 Luc Trudeau <luc@trud.ca>
+Lu Wang <wanglu@loongson.cn>
 Makoto Kato <makoto.kt@gmail.com>
 Mans Rullgard <mans@mansr.com>
 Marco Paniconi <marpan@google.com>
@@ -131,6 +137,7 @@
 Mike Frysinger <vapier@chromium.org>
 Mike Hommey <mhommey@mozilla.com>
 Mikhal Shemer <mikhal@google.com>
+Mikko Koivisto <mikko.koivisto@unikie.com>
 Min Chen <chenm003@gmail.com>
 Minghai Shang <minghai@google.com>
 Min Ye <yeemmi@google.com>
@@ -206,6 +213,7 @@
 Yaowu Xu <yaowu@google.com>
 Yi Luo <luoyi@google.com>
 Yongzhe Wang <yongzhe@google.com>
+yuanhecai <yuanhecai@loongson.cn>
 Yue Chen <yuec@google.com>
 Yun Liu <yliuyliu@google.com>
 Yunqing Wang <yunqingwang@google.com>
diff --git a/libvpx/CHANGELOG b/libvpx/CHANGELOG
index ea2fc9d..cd4e8ba 100644
--- a/libvpx/CHANGELOG
+++ b/libvpx/CHANGELOG
@@ -1,3 +1,35 @@
+2022-06-17 v1.12.0 "Torrent Duck"
+  This release adds optimizations for Loongarch, adds support for vp8 in the
+  real-time rate control library, upgrades GoogleTest to v1.11.0, updates
+  libwebm to libwebm-1.0.0.28-20-g206d268, and includes numerous bug fixes.
+
+  - Upgrading:
+    This release is ABI compatible with the previous release.
+
+    vp8 support in the real-time rate control library.
+    New codec control VP8E_SET_RTC_EXTERNAL_RATECTRL is added.
+
+    Configure support for darwin21 is added.
+
+    GoogleTest is upgraded to v1.11.0.
+
+    libwebm is updated to libwebm-1.0.0.28-20-g206d268.
+
+    Allow SimpleEncode environment to take target level as input to match
+    the level conformance in vp9.
+
+  - Enhancement:
+    Numerous improvements on checking memory allocations.
+    Optimizations for Loongarch.
+    Code clean-up.
+
+  - Bug fixes:
+    Fix to a crash related to {vp8/vp9}_set_roi_map.
+    Fix to compiling failure with -Wformat-nonliteral.
+    Fix to integer overflow in vp9 with high resolution content.
+    Fix to AddNoiseTest failure with ARMv7.
+    Fix to libvpx Null-dereference READ in vp8.
+
 2021-09-27 v1.11.0 "Smew Duck"
   This maintenance release adds support for VBR mode in VP9 rate control
   interface, new codec controls to get quantization parameters and loop filter
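
The vp8 entry point for the rate control library is the new VP8E_SET_RTC_EXTERNAL_RATECTRL codec control named above. A hedged sketch of enabling it on an initialized vp8 encoder context; it assumes the control takes an int flag, as most boolean codec controls do, so check vpx/vp8cx.h in this release to confirm:

    #include "vpx/vp8cx.h"
    #include "vpx/vpx_encoder.h"

    /* Sketch: route vp8 rate control through the external RTC library.
     * Assumes the control takes an int flag; confirm in vpx/vp8cx.h. */
    static int enable_external_rc(vpx_codec_ctx_t *codec) {
      return vpx_codec_control(codec, VP8E_SET_RTC_EXTERNAL_RATECTRL, 1) ==
             VPX_CODEC_OK;
    }
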
diff --git a/libvpx/README b/libvpx/README
index ddbcb9f..477a145 100644
--- a/libvpx/README
+++ b/libvpx/README
@@ -1,4 +1,4 @@
-README - 08 March 2021
+v1.12.0 Torrent Duck
 
 Welcome to the WebM VP8/VP9 Codec SDK!
 
@@ -103,6 +103,7 @@
     x86-win32-vs14
     x86-win32-vs15
     x86-win32-vs16
+    x86-win32-vs17
     x86_64-android-gcc
     x86_64-darwin9-gcc
     x86_64-darwin10-gcc
@@ -124,6 +125,7 @@
     x86_64-win64-vs14
     x86_64-win64-vs15
     x86_64-win64-vs16
+    x86_64-win64-vs17
     generic-gnu
 
   The generic-gnu target, in conjunction with the CROSS environment variable,
diff --git a/libvpx/args.c b/libvpx/args.c
index 17b6155..4afb9c0 100644
--- a/libvpx/args.c
+++ b/libvpx/args.c
@@ -83,6 +83,7 @@
 
 char **argv_dup(int argc, const char **argv) {
   char **new_argv = malloc((argc + 1) * sizeof(*argv));
+  if (!new_argv) return NULL;
 
   memcpy(new_argv, argv, argc * sizeof(*argv));
   new_argv[argc] = NULL;
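
argv_dup() can now return NULL on allocation failure, so its callers need a check. A sketch of the caller-side pattern, mirroring the one this snapshot adds to vp9_spatial_svc_encoder.c (the wrapper name is hypothetical):

    #include <stdio.h>
    #include <stdlib.h>
    #include "args.h" /* declares argv_dup() */

    /* argv_dup() may now return NULL, so exit early instead of letting
     * a later dereference crash (wrapper name is hypothetical). */
    static char **dup_args_or_die(int argc, const char **argv_) {
      char **argv = argv_dup(argc - 1, argv_ + 1);
      if (!argv) {
        fprintf(stderr, "Error allocating argument list\n");
        exit(EXIT_FAILURE);
      }
      return argv;
    }
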
diff --git a/libvpx/build/make/Makefile b/libvpx/build/make/Makefile
index 9ca97c8..b7a873c 100644
--- a/libvpx/build/make/Makefile
+++ b/libvpx/build/make/Makefile
@@ -151,6 +151,12 @@
 $(BUILD_PFX)%_msa.c.d: CFLAGS += -mmsa
 $(BUILD_PFX)%_msa.c.o: CFLAGS += -mmsa
 
+# LOONGARCH
+$(BUILD_PFX)%_lsx.c.d:  CFLAGS += -mlsx
+$(BUILD_PFX)%_lsx.c.o:  CFLAGS += -mlsx
+$(BUILD_PFX)%_lasx.c.d: CFLAGS += -mlasx
+$(BUILD_PFX)%_lasx.c.o: CFLAGS += -mlasx
+
 $(BUILD_PFX)%.c.d: %.c
 	$(if $(quiet),@echo "    [DEP] $@")
 	$(qexec)mkdir -p $(dir $@)
diff --git a/libvpx/build/make/ads2armasm_ms.pl b/libvpx/build/make/ads2armasm_ms.pl
index 2a2c470..dd4e031 100755
--- a/libvpx/build/make/ads2armasm_ms.pl
+++ b/libvpx/build/make/ads2armasm_ms.pl
@@ -28,7 +28,7 @@
     s/qsubaddx/qsax/i;
     s/qaddsubx/qasx/i;
 
-    thumb::FixThumbInstructions($_, 1);
+    thumb::FixThumbInstructions($_);
 
     s/ldrneb/ldrbne/i;
     s/ldrneh/ldrhne/i;
diff --git a/libvpx/build/make/ads2gas.pl b/libvpx/build/make/ads2gas.pl
index b6a8f53..c301b7f 100755
--- a/libvpx/build/make/ads2gas.pl
+++ b/libvpx/build/make/ads2gas.pl
@@ -32,7 +32,7 @@
 
 print "@ This file was created from a .asm file\n";
 print "@  using the ads2gas.pl script.\n";
-print "\t.syntax unified\n";
+print ".syntax unified\n";
 if ($thumb) {
     print "\t.thumb\n";
 }
@@ -42,39 +42,11 @@
 
 while (<STDIN>)
 {
-    undef $comment;
-    undef $line;
-    $comment_char = ";";
-    $comment_sub = "@";
-
-    # Handle comments.
-    if (/$comment_char/)
-    {
-      $comment = "";
-      ($line, $comment) = /(.*?)$comment_char(.*)/;
-      $_ = $line;
-    }
-
     # Load and store alignment
     s/@/,:/g;
 
-    # Hexadecimal constants prefaced by 0x
-    s/#&/#0x/g;
-
-    # Convert :OR: to |
-    s/:OR:/ | /g;
-
-    # Convert :AND: to &
-    s/:AND:/ & /g;
-
-    # Convert :NOT: to ~
-    s/:NOT:/ ~ /g;
-
-    # Convert :SHL: to <<
-    s/:SHL:/ << /g;
-
-    # Convert :SHR: to >>
-    s/:SHR:/ >> /g;
+    # Comment character
+    s/;/@/;
 
     # Convert ELSE to .else
     s/\bELSE\b/.else/g;
@@ -82,105 +54,51 @@
     # Convert ENDIF to .endif
     s/\bENDIF\b/.endif/g;
 
-    # Convert ELSEIF to .elseif
-    s/\bELSEIF\b/.elseif/g;
-
-    # Convert LTORG to .ltorg
-    s/\bLTORG\b/.ltorg/g;
-
-    # Convert endfunc to nothing.
-    s/\bendfunc\b//ig;
-
-    # Convert FUNCTION to nothing.
-    s/\bFUNCTION\b//g;
-    s/\bfunction\b//g;
-
-    s/\bENTRY\b//g;
-    s/\bMSARMASM\b/0/g;
-    s/^\s+end\s+$//g;
-
-    # Convert IF :DEF:to .if
-    # gcc doesn't have the ability to do a conditional
-    # if defined variable that is set by IF :DEF: on
-    # armasm, so convert it to a normal .if and then
-    # make sure to define a value elesewhere
-    if (s/\bIF :DEF:\b/.if /g)
-    {
-        s/=/==/g;
-    }
-
     # Convert IF to .if
-    if (s/\bIF\b/.if/g)
-    {
+    if (s/\bIF\b/.if/g) {
         s/=+/==/g;
     }
 
     # Convert INCLUDE to .INCLUDE "file"
-    s/INCLUDE(\s*)(.*)$/.include $1\"$2\"/;
-
-    # Code directive (ARM vs Thumb)
-    s/CODE([0-9][0-9])/.code $1/;
+    s/INCLUDE\s?(.*)$/.include \"$1\"/;
 
     # No AREA required
     # But ALIGNs in AREA must be obeyed
-    s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/;
+    s/^(\s*)\bAREA\b.*ALIGN=([0-9])$/$1.text\n$1.p2align $2/;
     # If no ALIGN, strip the AREA and align to 4 bytes
-    s/^\s*AREA.*$/.text\n.p2align 2/;
+    s/^(\s*)\bAREA\b.*$/$1.text\n$1.p2align 2/;
 
-    # DCD to .word
-    # This one is for incoming symbols
-    s/DCD\s+\|(\w*)\|/.long $1/;
-
-    # DCW to .short
-    s/DCW\s+\|(\w*)\|/.short $1/;
-    s/DCW(.*)/.short $1/;
-
-    # Constants defined in scope
-    s/DCD(.*)/.long $1/;
-    s/DCB(.*)/.byte $1/;
-
-    # Make function visible to linker, and make additional symbol with
-    # prepended underscore
+    # Make function visible to linker.
     if ($elf) {
-        s/EXPORT\s+\|([\$\w]*)\|/.global $1 \n\t.type $1, function/;
+        s/(\s*)EXPORT\s+\|([\$\w]*)\|/$1.global $2\n$1.type $2, function/;
     } else {
-        s/EXPORT\s+\|([\$\w]*)\|/.global $1/;
+        s/(\s*)EXPORT\s+\|([\$\w]*)\|/$1.global $2/;
     }
-    s/IMPORT\s+\|([\$\w]*)\|/.global $1/;
 
-    s/EXPORT\s+([\$\w]*)/.global $1/;
-    s/export\s+([\$\w]*)/.global $1/;
-
-    # No vertical bars required; make additional symbol with prepended
-    # underscore
-    s/^\|(\$?\w+)\|/_$1\n\t$1:/g;
+    # No vertical bars on function names
+    s/^\|(\$?\w+)\|/$1/g;
 
     # Labels need trailing colon
-#   s/^(\w+)/$1:/ if !/EQU/;
-    # put the colon at the end of the line in the macro
     s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/;
 
     # ALIGN directive
     s/\bALIGN\b/.balign/g;
 
     if ($thumb) {
-        # ARM code - we force everything to thumb with the declaration in the header
-        s/\sARM//g;
+        # ARM code - we force everything to thumb with the declaration in the
+        # header
+        s/\bARM\b//g;
     } else {
         # ARM code
-        s/\sARM/.arm/g;
+        s/\bARM\b/.arm/g;
     }
 
     # push/pop
     s/(push\s+)(r\d+)/stmdb sp\!, \{$2\}/g;
     s/(pop\s+)(r\d+)/ldmia sp\!, \{$2\}/g;
 
-    # NEON code
-    s/(vld1.\d+\s+)(q\d+)/$1\{$2\}/g;
-    s/(vtbl.\d+\s+[^,]+),([^,]+)/$1,\{$2\}/g;
-
     if ($thumb) {
-        thumb::FixThumbInstructions($_, 0);
+        thumb::FixThumbInstructions($_);
     }
 
     # eabi_attributes numerical equivalents can be found in the
@@ -188,31 +106,31 @@
 
     if ($elf) {
         # REQUIRE8 Stack is required to be 8-byte aligned
-        s/\sREQUIRE8/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g;
+        s/\bREQUIRE8\b/.eabi_attribute 24, 1 \@Tag_ABI_align_needed/g;
 
         # PRESERVE8 Stack 8-byte align is preserved
-        s/\sPRESERVE8/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g;
+        s/\bPRESERVE8\b/.eabi_attribute 25, 1 \@Tag_ABI_align_preserved/g;
     } else {
-        s/\sREQUIRE8//;
-        s/\sPRESERVE8//;
+        s/\bREQUIRE8\b//;
+        s/\bPRESERVE8\b//;
     }
 
     # Use PROC and ENDP to give the symbols a .size directive.
     # This makes them show up properly in debugging tools like gdb and valgrind.
-    if (/\bPROC\b/)
-    {
+    if (/\bPROC\b/) {
         my $proc;
-        /^_([\.0-9A-Z_a-z]\w+)\b/;
+        # Match the function name so it can be stored in $proc
+        /^([\.0-9A-Z_a-z]\w+)\b/;
         $proc = $1;
         push(@proc_stack, $proc) if ($proc);
         s/\bPROC\b/@ $&/;
     }
-    if (/\bENDP\b/)
-    {
+
+    if (/\bENDP\b/) {
         my $proc;
         s/\bENDP\b/@ $&/;
         $proc = pop(@proc_stack);
-        $_ = "\t.size $proc, .-$proc".$_ if ($proc and $elf);
+        $_ = ".size $proc, .-$proc".$_ if ($proc and $elf);
     }
 
     # EQU directive
@@ -220,19 +138,20 @@
 
     # Begin macro definition
     if (/\bMACRO\b/) {
+        # Process next line down, which will be the macro definition
         $_ = <STDIN>;
         s/^/.macro/;
-        s/\$//g;                # remove formal param reference
-        s/;/@/g;                # change comment characters
+        s/\$//g;             # Remove $ from the variables in the declaration
     }
 
-    # For macros, use \ to reference formal params
-    s/\$/\\/g;                  # End macro definition
-    s/\bMEND\b/.endm/;              # No need to tell it where to stop assembling
+    s/\$/\\/g;               # Use \ to reference formal parameters
+    # End macro definition
+
+    s/\bMEND\b/.endm/;       # No need to tell it where to stop assembling
     next if /^\s*END\s*$/;
+    s/[ \t]+$//;
     print;
-    print "$comment_sub$comment\n" if defined $comment;
 }
 
 # Mark that this object doesn't need an executable stack.
-printf ("\t.section\t.note.GNU-stack,\"\",\%\%progbits\n") if $elf;
+printf ("    .section .note.GNU-stack,\"\",\%\%progbits\n") if $elf;
diff --git a/libvpx/build/make/ads2gas_apple.pl b/libvpx/build/make/ads2gas_apple.pl
index 848872f..62491c1 100755
--- a/libvpx/build/make/ads2gas_apple.pl
+++ b/libvpx/build/make/ads2gas_apple.pl
@@ -20,17 +20,14 @@
 
 print "@ This file was created from a .asm file\n";
 print "@  using the ads2gas_apple.pl script.\n\n";
-print "\t.syntax unified\n";
+print ".syntax unified\n";
 
-my %register_aliases;
 my %macro_aliases;
 
 my @mapping_list = ("\$0", "\$1", "\$2", "\$3", "\$4", "\$5", "\$6", "\$7", "\$8", "\$9");
 
 my @incoming_array;
 
-my @imported_functions;
-
 # Perl trim function to remove whitespace from the start and end of the string
 sub trim($)
 {
@@ -46,25 +43,7 @@
     s/@/,:/g;
 
     # Comment character
-    s/;/ @/g;
-
-    # Hexadecimal constants prefaced by 0x
-    s/#&/#0x/g;
-
-    # Convert :OR: to |
-    s/:OR:/ | /g;
-
-    # Convert :AND: to &
-    s/:AND:/ & /g;
-
-    # Convert :NOT: to ~
-    s/:NOT:/ ~ /g;
-
-    # Convert :SHL: to <<
-    s/:SHL:/ << /g;
-
-    # Convert :SHR: to >>
-    s/:SHR:/ >> /g;
+    s/;/@/;
 
     # Convert ELSE to .else
     s/\bELSE\b/.else/g;
@@ -72,131 +51,64 @@
     # Convert ENDIF to .endif
     s/\bENDIF\b/.endif/g;
 
-    # Convert ELSEIF to .elseif
-    s/\bELSEIF\b/.elseif/g;
-
-    # Convert LTORG to .ltorg
-    s/\bLTORG\b/.ltorg/g;
-
-    # Convert IF :DEF:to .if
-    # gcc doesn't have the ability to do a conditional
-    # if defined variable that is set by IF :DEF: on
-    # armasm, so convert it to a normal .if and then
-    # make sure to define a value elesewhere
-    if (s/\bIF :DEF:\b/.if /g)
-    {
-        s/=/==/g;
-    }
-
     # Convert IF to .if
-    if (s/\bIF\b/.if/g)
-    {
-        s/=/==/g;
+    if (s/\bIF\b/.if/g) {
+        s/=+/==/g;
     }
 
     # Convert INCLUDE to .INCLUDE "file"
-    s/INCLUDE(\s*)(.*)$/.include $1\"$2\"/;
-
-    # Code directive (ARM vs Thumb)
-    s/CODE([0-9][0-9])/.code $1/;
+    s/INCLUDE\s?(.*)$/.include \"$1\"/;
 
     # No AREA required
     # But ALIGNs in AREA must be obeyed
-    s/^\s*AREA.*ALIGN=([0-9])$/.text\n.p2align $1/;
+    s/^(\s*)\bAREA\b.*ALIGN=([0-9])$/$1.text\n$1.p2align $2/;
     # If no ALIGN, strip the AREA and align to 4 bytes
-    s/^\s*AREA.*$/.text\n.p2align 2/;
+    s/^(\s*)\bAREA\b.*$/$1.text\n$1.p2align 2/;
 
-    # DCD to .word
-    # This one is for incoming symbols
-    s/DCD\s+\|(\w*)\|/.long $1/;
+    # Make function visible to linker.
+    s/EXPORT\s+\|([\$\w]*)\|/.globl _$1/;
 
-    # DCW to .short
-    s/DCW\s+\|(\w*)\|/.short $1/;
-    s/DCW(.*)/.short $1/;
+    # No vertical bars on function names
+    s/^\|(\$?\w+)\|/$1/g;
 
-    # Constants defined in scope
-    s/DCD(.*)/.long $1/;
-    s/DCB(.*)/.byte $1/;
+    # Labels and functions need a leading underscore and trailing colon
+    s/^([a-zA-Z_0-9\$]+)/_$1:/ if !/EQU/;
 
-    # Make function visible to linker, and make additional symbol with
-    # prepended underscore
-    s/EXPORT\s+\|([\$\w]*)\|/.globl _$1\n\t.globl $1/;
-
-    # Prepend imported functions with _
-    if (s/IMPORT\s+\|([\$\w]*)\|/.globl $1/)
-    {
-        $function = trim($1);
-        push(@imported_functions, $function);
-    }
-
-    foreach $function (@imported_functions)
-    {
-        s/$function/_$function/;
-    }
-
-    # No vertical bars required; make additional symbol with prepended
-    # underscore
-    s/^\|(\$?\w+)\|/_$1\n\t$1:/g;
-
-    # Labels need trailing colon
-#   s/^(\w+)/$1:/ if !/EQU/;
-    # put the colon at the end of the line in the macro
-    s/^([a-zA-Z_0-9\$]+)/$1:/ if !/EQU/;
+    # Branches need to call the correct, underscored, function
+    s/^(\s+b[egln]?[teq]?\s+)([a-zA-Z_0-9\$]+)/$1 _$2/ if !/EQU/;
 
     # ALIGN directive
     s/\bALIGN\b/.balign/g;
 
     # Strip ARM
-    s/\sARM/@ ARM/g;
+    s/\s+ARM//;
 
     # Strip REQUIRE8
-    #s/\sREQUIRE8/@ REQUIRE8/g;
-    s/\sREQUIRE8/@ /g;
+    s/\s+REQUIRE8//;
 
     # Strip PRESERVE8
-    s/\sPRESERVE8/@ PRESERVE8/g;
+    s/\s+PRESERVE8//;
 
     # Strip PROC and ENDPROC
-    s/\bPROC\b/@/g;
-    s/\bENDP\b/@/g;
+    s/\bPROC\b//g;
+    s/\bENDP\b//g;
 
     # EQU directive
-    s/(.*)EQU(.*)/.set $1, $2/;
+    s/(\S+\s+)EQU(\s+\S+)/.equ $1, $2/;
 
     # Begin macro definition
-    if (/\bMACRO\b/)
-    {
+    if (/\bMACRO\b/) {
         # Process next line down, which will be the macro definition
         $_ = <STDIN>;
-
-        $trimmed = trim($_);
-
-        # remove commas that are separating list
-        $trimmed =~ s/,//g;
-
-        # string to array
-        @incoming_array = split(/\s+/, $trimmed);
-
-        print ".macro @incoming_array[0]\n";
-
-        # remove the first element, as that is the name of the macro
-        shift (@incoming_array);
-
-        @macro_aliases{@incoming_array} = @mapping_list;
-
-        next;
+        s/^/.macro/;
+        s/\$//g;             # Remove $ from the variables in the declaration
     }
 
-    while (($key, $value) = each(%macro_aliases))
-    {
-        $key =~ s/\$/\\\$/;
-        s/$key\b/$value/g;
-    }
+    s/\$/\\/g;               # Use \ to reference formal parameters
+    # End macro definition
 
-    # For macros, use \ to reference formal params
-#   s/\$/\\/g;                  # End macro definition
-    s/\bMEND\b/.endm/;              # No need to tell it where to stop assembling
+    s/\bMEND\b/.endm/;       # No need to tell it where to stop assembling
     next if /^\s*END\s*$/;
-
+    s/[ \t]+$//;
     print;
 }
diff --git a/libvpx/build/make/configure.sh b/libvpx/build/make/configure.sh
index 81d30a1..581042e 100755
--- a/libvpx/build/make/configure.sh
+++ b/libvpx/build/make/configure.sh
@@ -449,6 +449,17 @@
   fi
 }
 
+check_inline_asm() {
+  log check_inline_asm "$@"
+  name="$1"
+  code="$2"
+  shift 2
+  disable_feature $name
+  check_cc "$@" <<EOF && enable_feature $name
+void foo(void) { __asm__ volatile($code); }
+EOF
+}
+
 write_common_config_banner() {
   print_webm_license config.mk "##" ""
   echo '# This file automatically generated by configure. Do not edit!' >> config.mk
@@ -766,6 +777,12 @@
       *mips32el*)
         tgt_isa=mips32
         ;;
+      loongarch32*)
+        tgt_isa=loongarch32
+        ;;
+      loongarch64*)
+        tgt_isa=loongarch64
+        ;;
     esac
 
     # detect tgt_os
@@ -774,7 +791,7 @@
         tgt_isa=x86_64
         tgt_os=`echo $gcctarget | sed 's/.*\(darwin1[0-9]\).*/\1/'`
         ;;
-      *darwin20*)
+      *darwin2[0-1]*)
         tgt_isa=`uname -m`
         tgt_os=`echo $gcctarget | sed 's/.*\(darwin2[0-9]\).*/\1/'`
         ;;
@@ -834,6 +851,11 @@
     ppc*)
       enable_feature ppc
       ;;
+    loongarch*)
+      soft_enable lsx
+      soft_enable lasx
+      enable_feature loongarch
+      ;;
   esac
 
   # PIC is probably what we want when building shared libs
@@ -918,9 +940,9 @@
       add_cflags  "-mmacosx-version-min=10.15"
       add_ldflags "-mmacosx-version-min=10.15"
       ;;
-    *-darwin20-*)
-      add_cflags  "-mmacosx-version-min=10.16 -arch ${toolchain%%-*}"
-      add_ldflags "-mmacosx-version-min=10.16 -arch ${toolchain%%-*}"
+    *-darwin2[0-1]-*)
+      add_cflags  "-arch ${toolchain%%-*}"
+      add_ldflags "-arch ${toolchain%%-*}"
       ;;
     *-iphonesimulator-*)
       add_cflags  "-miphoneos-version-min=${IOS_VERSION_MIN}"
@@ -1419,6 +1441,15 @@
           ;;
       esac
       ;;
+    loongarch*)
+      link_with_cc=gcc
+      setup_gnu_toolchain
+
+      enabled lsx && check_inline_asm lsx '"vadd.b $vr0, $vr1, $vr1"'
+      enabled lsx && soft_enable runtime_cpu_detect
+      enabled lasx && check_inline_asm lasx '"xvadd.b $xr0, $xr1, $xr1"'
+      enabled lasx && soft_enable runtime_cpu_detect
+      ;;
     *-gcc|generic-gnu)
       link_with_cc=gcc
       enable_feature gcc
@@ -1521,6 +1552,22 @@
       ;;
   esac
 
+  # only for LOONGARCH platforms
+  case ${toolchain} in
+    loongarch*)
+      if enabled big_endian; then
+        if enabled lsx; then
+          echo "lsx optimizations are available only for little endian platforms"
+          disable_feature lsx
+        fi
+        if enabled lasx; then
+          echo "lasx optimizations are available only for little endian platforms"
+          disable_feature lasx
+        fi
+      fi
+      ;;
+  esac
+
   # glibc needs these
   if enabled linux; then
     add_cflags -D_LARGEFILE_SOURCE
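
check_inline_asm() probes the toolchain by compiling a one-line translation unit. For the lsx probe wired up later in this file, the generated test file amounts to the following; if it compiles, configure enables the feature (lasx probes xvadd.b the same way):

    /* What configure feeds to the compiler for the lsx probe: if this
     * translation unit builds, the assembler accepts LSX instructions
     * and the feature is enabled. */
    void foo(void) { __asm__ volatile("vadd.b $vr0, $vr1, $vr1"); }
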
diff --git a/libvpx/build/make/gen_msvs_sln.sh b/libvpx/build/make/gen_msvs_sln.sh
index d1adfd7..0b31285 100755
--- a/libvpx/build/make/gen_msvs_sln.sh
+++ b/libvpx/build/make/gen_msvs_sln.sh
@@ -25,7 +25,7 @@
 Options:
     --help                      Print this message
     --out=outfile               Redirect output to a file
-    --ver=version               Version (14-16) of visual studio to generate for
+    --ver=version               Version (14-17) of visual studio to generate for
     --target=isa-os-cc          Target specifier
 EOF
     exit 1
@@ -219,6 +219,7 @@
         14) vs_year=2015 ;;
         15) vs_year=2017 ;;
         16) vs_year=2019 ;;
+        17) vs_year=2022 ;;
         *) die Unrecognized Visual Studio Version in $opt ;;
       esac
     ;;
@@ -232,7 +233,7 @@
 outfile=${outfile:-/dev/stdout}
 mkoutfile=${mkoutfile:-/dev/stdout}
 case "${vs_ver}" in
-    1[4-6])
+    1[4-7])
       # VS has used Format Version 12.00 continuously since vs11.
       sln_vers="12.00"
       sln_vers_str="Visual Studio ${vs_year}"
diff --git a/libvpx/build/make/gen_msvs_vcxproj.sh b/libvpx/build/make/gen_msvs_vcxproj.sh
index 6f91ad4..58bb66b 100755
--- a/libvpx/build/make/gen_msvs_vcxproj.sh
+++ b/libvpx/build/make/gen_msvs_vcxproj.sh
@@ -170,7 +170,7 @@
         --ver=*)
             vs_ver="$optval"
             case "$optval" in
-                1[4-6])
+                1[4-7])
                 ;;
                 *) die Unrecognized Visual Studio Version in $opt
                 ;;
@@ -344,6 +344,9 @@
             if [ "$vs_ver" = "16" ]; then
                 tag_content PlatformToolset v142
             fi
+            if [ "$vs_ver" = "17" ]; then
+                tag_content PlatformToolset v143
+            fi
             tag_content CharacterSet Unicode
             if [ "$config" = "Release" ]; then
                 tag_content WholeProgramOptimization true
diff --git a/libvpx/build/make/rtcd.pl b/libvpx/build/make/rtcd.pl
index acb9f6e..9c97268 100755
--- a/libvpx/build/make/rtcd.pl
+++ b/libvpx/build/make/rtcd.pl
@@ -387,6 +387,37 @@
   common_bottom;
 }
 
+sub loongarch() {
+  determine_indirection("c", @ALL_ARCHS);
+
+  # Assign the helper variable for each enabled extension
+  foreach my $opt (@ALL_ARCHS) {
+    my $opt_uc = uc $opt;
+    eval "\$have_${opt}=\"flags & HAS_${opt_uc}\"";
+  }
+
+  common_top;
+  print <<EOF;
+#include "vpx_config.h"
+
+#ifdef RTCD_C
+#include "vpx_ports/loongarch.h"
+static void setup_rtcd_internal(void)
+{
+    int flags = loongarch_cpu_caps();
+
+    (void)flags;
+EOF
+
+  set_function_pointers("c", @ALL_ARCHS);
+
+  print <<EOF;
+}
+#endif
+EOF
+  common_bottom;
+}
+
 sub unoptimized() {
   determine_indirection "c";
   common_top;
@@ -462,6 +493,9 @@
 } elsif ($opts{arch} =~ /^ppc/ ) {
   @ALL_ARCHS = filter(qw/vsx/);
   ppc;
+} elsif ($opts{arch} =~ /loongarch/ ) {
+  @ALL_ARCHS = filter(qw/lsx lasx/);
+  loongarch;
 } else {
   unoptimized;
 }
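
The new loongarch() sub in rtcd.pl emits a runtime dispatcher. The guarded assignments come from set_function_pointers(), which is not shown in this hunk, so the kernel name in the sketch below is illustrative only:

    #include "vpx_config.h"

    #ifdef RTCD_C
    #include "vpx_ports/loongarch.h"
    static void setup_rtcd_internal(void) {
      int flags = loongarch_cpu_caps();
      (void)flags;
      /* Illustrative output of set_function_pointers(); the real header
       * emits one guard per function that has an LSX/LASX variant: */
      /* if (flags & HAS_LSX) vpx_some_kernel = vpx_some_kernel_lsx; */
    }
    #endif
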
diff --git a/libvpx/build/make/thumb.pm b/libvpx/build/make/thumb.pm
index 9c49e2d..ef4b316 100644
--- a/libvpx/build/make/thumb.pm
+++ b/libvpx/build/make/thumb.pm
@@ -11,11 +11,8 @@
 
 package thumb;
 
-sub FixThumbInstructions($$)
+sub FixThumbInstructions($)
 {
-    my $short_branches = $_[1];
-    my $branch_shift_offset = $short_branches ? 1 : 0;
-
     # Write additions with shifts, such as "add r10, r11, lsl #8",
     # in three operand form, "add r10, r10, r11, lsl #8".
     s/(add\s+)(r\d+),\s*(r\d+),\s*(lsl #\d+)/$1$2, $2, $3, $4/g;
diff --git a/libvpx/configure b/libvpx/configure
index da631a4..beea650 100755
--- a/libvpx/configure
+++ b/libvpx/configure
@@ -100,6 +100,7 @@
 all_platforms="${all_platforms} arm64-android-gcc"
 all_platforms="${all_platforms} arm64-darwin-gcc"
 all_platforms="${all_platforms} arm64-darwin20-gcc"
+all_platforms="${all_platforms} arm64-darwin21-gcc"
 all_platforms="${all_platforms} arm64-linux-gcc"
 all_platforms="${all_platforms} arm64-win64-gcc"
 all_platforms="${all_platforms} arm64-win64-vs15"
@@ -113,6 +114,8 @@
 all_platforms="${all_platforms} armv7-win32-vs15"
 all_platforms="${all_platforms} armv7s-darwin-gcc"
 all_platforms="${all_platforms} armv8-linux-gcc"
+all_platforms="${all_platforms} loongarch32-linux-gcc"
+all_platforms="${all_platforms} loongarch64-linux-gcc"
 all_platforms="${all_platforms} mips32-linux-gcc"
 all_platforms="${all_platforms} mips64-linux-gcc"
 all_platforms="${all_platforms} ppc64le-linux-gcc"
@@ -139,6 +142,7 @@
 all_platforms="${all_platforms} x86-win32-vs14"
 all_platforms="${all_platforms} x86-win32-vs15"
 all_platforms="${all_platforms} x86-win32-vs16"
+all_platforms="${all_platforms} x86-win32-vs17"
 all_platforms="${all_platforms} x86_64-android-gcc"
 all_platforms="${all_platforms} x86_64-darwin9-gcc"
 all_platforms="${all_platforms} x86_64-darwin10-gcc"
@@ -152,6 +156,7 @@
 all_platforms="${all_platforms} x86_64-darwin18-gcc"
 all_platforms="${all_platforms} x86_64-darwin19-gcc"
 all_platforms="${all_platforms} x86_64-darwin20-gcc"
+all_platforms="${all_platforms} x86_64-darwin21-gcc"
 all_platforms="${all_platforms} x86_64-iphonesimulator-gcc"
 all_platforms="${all_platforms} x86_64-linux-gcc"
 all_platforms="${all_platforms} x86_64-linux-icc"
@@ -160,6 +165,7 @@
 all_platforms="${all_platforms} x86_64-win64-vs14"
 all_platforms="${all_platforms} x86_64-win64-vs15"
 all_platforms="${all_platforms} x86_64-win64-vs16"
+all_platforms="${all_platforms} x86_64-win64-vs17"
 all_platforms="${all_platforms} generic-gnu"
 
 # all_targets is a list of all targets that can be configured
@@ -235,6 +241,7 @@
     x86
     x86_64
     ppc
+    loongarch
 "
 ARCH_EXT_LIST_X86="
     mmx
@@ -250,6 +257,8 @@
 
 ARCH_EXT_LIST_LOONGSON="
     mmi
+    lsx
+    lasx
 "
 
 ARCH_EXT_LIST="
@@ -620,7 +629,10 @@
         check_add_cflags -Wall
         check_add_cflags -Wdeclaration-after-statement
         check_add_cflags -Wdisabled-optimization
+        check_add_cflags -Wextra-semi
+        check_add_cflags -Wextra-semi-stmt
         check_add_cflags -Wfloat-conversion
+        check_add_cflags -Wformat=2
         check_add_cflags -Wparentheses-equality
         check_add_cflags -Wpointer-arith
         check_add_cflags -Wtype-limits
diff --git a/libvpx/examples/postproc.c b/libvpx/examples/postproc.c
index be999b4..b53c15e 100644
--- a/libvpx/examples/postproc.c
+++ b/libvpx/examples/postproc.c
@@ -109,7 +109,7 @@
                                 0 };
       if (vpx_codec_control(&codec, VP8_SET_POSTPROC, &pp))
         die_codec(&codec, "Failed to turn on postproc.");
-    };
+    }
 
     // Decode the frame with 15ms deadline
     if (vpx_codec_decode(&codec, frame, (unsigned int)frame_size, NULL, 15000))
diff --git a/libvpx/examples/svc_encodeframe.c b/libvpx/examples/svc_encodeframe.c
index a73ee8e..08bda0e 100644
--- a/libvpx/examples/svc_encodeframe.c
+++ b/libvpx/examples/svc_encodeframe.c
@@ -21,6 +21,7 @@
 #include <stdlib.h>
 #include <string.h>
 #define VPX_DISABLE_CTRL_TYPECHECKS 1
+#include "../tools_common.h"
 #include "./vpx_config.h"
 #include "./svc_context.h"
 #include "vpx/vp8cx.h"
@@ -95,8 +96,9 @@
   return (const SvcInternal_t *)svc_ctx->internal;
 }
 
-static int svc_log(SvcContext *svc_ctx, SVC_LOG_LEVEL level, const char *fmt,
-                   ...) {
+static VPX_TOOLS_FORMAT_PRINTF(3, 4) int svc_log(SvcContext *svc_ctx,
+                                                 SVC_LOG_LEVEL level,
+                                                 const char *fmt, ...) {
   char buf[512];
   int retval = 0;
   va_list ap;
@@ -264,7 +266,7 @@
   if (alt_ref_enabled > REF_FRAMES - svc_ctx->spatial_layers) {
     svc_log(svc_ctx, SVC_LOG_ERROR,
             "svc: auto alt ref: Maxinum %d(REF_FRAMES - layers) layers could"
-            "enabled auto alt reference frame, but % layers are enabled\n",
+            "enabled auto alt reference frame, but %d layers are enabled\n",
             REF_FRAMES - svc_ctx->spatial_layers, alt_ref_enabled);
     res = VPX_CODEC_INVALID_PARAM;
   }
@@ -456,10 +458,11 @@
     svc_ctx->temporal_layers = VPX_TS_MAX_LAYERS;
 
   if (svc_ctx->temporal_layers * svc_ctx->spatial_layers > VPX_MAX_LAYERS) {
-    svc_log(svc_ctx, SVC_LOG_ERROR,
-            "spatial layers * temporal layers exceeds the maximum number of "
-            "allowed layers of %d\n",
-            svc_ctx->spatial_layers * svc_ctx->temporal_layers, VPX_MAX_LAYERS);
+    svc_log(
+        svc_ctx, SVC_LOG_ERROR,
+        "spatial layers * temporal layers (%d) exceeds the maximum number of "
+        "allowed layers of %d\n",
+        svc_ctx->spatial_layers * svc_ctx->temporal_layers, VPX_MAX_LAYERS);
     return VPX_CODEC_INVALID_PARAM;
   }
   res = assign_layer_bitrates(svc_ctx, enc_cfg);
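
VPX_TOOLS_FORMAT_PRINTF comes from the newly included ../tools_common.h. Its presumed shape is the GCC/Clang printf format attribute, which is what lets -Wformat=2 (enabled in configure above) catch bugs like the bare % fixed in this hunk; a hedged sketch:

    /* Presumed definition (hedged): expands to the printf format
     * attribute on GCC/Clang and to nothing elsewhere, so calls such as
     *   svc_log(ctx, SVC_LOG_ERROR, "%d layers\n", n);
     * are type-checked against their format string at compile time. */
    #if defined(__GNUC__)
    #define VPX_TOOLS_FORMAT_PRINTF(string_index, first_to_check) \
      __attribute__((__format__(__printf__, string_index, first_to_check)))
    #else
    #define VPX_TOOLS_FORMAT_PRINTF(string_index, first_to_check)
    #endif
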
diff --git a/libvpx/examples/twopass_encoder.c b/libvpx/examples/twopass_encoder.c
index 3d950b2..07a10d9 100644
--- a/libvpx/examples/twopass_encoder.c
+++ b/libvpx/examples/twopass_encoder.c
@@ -84,6 +84,7 @@
       const uint8_t *const pkt_buf = pkt->data.twopass_stats.buf;
       const size_t pkt_size = pkt->data.twopass_stats.sz;
       stats->buf = realloc(stats->buf, stats->sz + pkt_size);
+      if (!stats->buf) die("Failed to reallocate stats buffer.");
       memcpy((uint8_t *)stats->buf + stats->sz, pkt_buf, pkt_size);
       stats->sz += pkt_size;
     }
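
The new check makes the two-pass example die cleanly instead of crashing inside memcpy. Code that must recover rather than exit usually keeps the old pointer across the call; a sketch with hypothetical names:

    #include <stdlib.h>
    #include <string.h>

    /* Recoverable variant of the stats-buffer growth (hypothetical
     * helper): keep the old pointer so it is not leaked on failure. */
    static int append_stats(void **buf, size_t *sz,
                            const void *pkt_buf, size_t pkt_size) {
      void *grown = realloc(*buf, *sz + pkt_size);
      if (!grown) return 0; /* caller still owns *buf, size unchanged */
      memcpy((unsigned char *)grown + *sz, pkt_buf, pkt_size);
      *buf = grown;
      *sz += pkt_size;
      return 1;
    }
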
@@ -221,7 +222,7 @@
     die("Invalid frame size: %dx%d", w, h);
 
   if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, w, h, 1))
-    die("Failed to allocate image", w, h);
+    die("Failed to allocate image (%dx%d)", w, h);
 
   printf("Using %s\n", vpx_codec_iface_name(encoder->codec_interface()));
 
diff --git a/libvpx/examples/vp8_multi_resolution_encoder.c b/libvpx/examples/vp8_multi_resolution_encoder.c
index e72f8a0..62d96de 100644
--- a/libvpx/examples/vp8_multi_resolution_encoder.c
+++ b/libvpx/examples/vp8_multi_resolution_encoder.c
@@ -352,7 +352,7 @@
   framerate = (int)strtol(argv[3], NULL, 0);
 
   if (width < 16 || width % 2 || height < 16 || height % 2)
-    die("Invalid resolution: %ldx%ld", width, height);
+    die("Invalid resolution: %dx%d", width, height);
 
   /* Open input video file for encoding */
   if (!(infile = fopen(argv[4], "rb")))
@@ -380,7 +380,7 @@
         (int)strtol(argv[2 * NUM_ENCODERS + 5 + i], NULL, 0);
     if (num_temporal_layers[i] < 1 || num_temporal_layers[i] > 3)
       die("Invalid temporal layers: %d, Must be 1, 2, or 3. \n",
-          num_temporal_layers);
+          num_temporal_layers[i]);
   }
 
   /* Open file to write out each spatially downsampled input stream. */
@@ -468,7 +468,7 @@
   /* Allocate image for each encoder */
   for (i = 0; i < NUM_ENCODERS; i++)
     if (!vpx_img_alloc(&raw[i], VPX_IMG_FMT_I420, cfg[i].g_w, cfg[i].g_h, 32))
-      die("Failed to allocate image", cfg[i].g_w, cfg[i].g_h);
+      die("Failed to allocate image (%dx%d)", cfg[i].g_w, cfg[i].g_h);
 
   if (raw[0].stride[VPX_PLANE_Y] == (int)raw[0].d_w)
     read_frame_p = mulres_read_frame;
diff --git a/libvpx/examples/vp9_spatial_svc_encoder.c b/libvpx/examples/vp9_spatial_svc_encoder.c
index c37e608..e85dbf8 100644
--- a/libvpx/examples/vp9_spatial_svc_encoder.c
+++ b/libvpx/examples/vp9_spatial_svc_encoder.c
@@ -222,6 +222,10 @@
 
   // process command line options
   argv = argv_dup(argc - 1, argv_ + 1);
+  if (!argv) {
+    fprintf(stderr, "Error allocating argument list\n");
+    exit(EXIT_FAILURE);
+  }
   for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
     arg.argv_step = 1;
 
@@ -357,6 +361,8 @@
   if (app_input->input_ctx.file_type == FILE_TYPE_Y4M) {
     enc_cfg->g_w = app_input->input_ctx.width;
     enc_cfg->g_h = app_input->input_ctx.height;
+    enc_cfg->g_timebase.den = app_input->input_ctx.framerate.numerator;
+    enc_cfg->g_timebase.num = app_input->input_ctx.framerate.denominator;
   }
 
   if (enc_cfg->g_w < 16 || enc_cfg->g_w % 2 || enc_cfg->g_h < 16 ||
@@ -579,7 +585,8 @@
       ref_frame_config->alt_fb_idx[sl] = 0;
     } else if (tl == 1) {
       ref_frame_config->lst_fb_idx[sl] = sl;
-      ref_frame_config->gld_fb_idx[sl] = num_spatial_layers + sl - 1;
+      ref_frame_config->gld_fb_idx[sl] =
+          (sl == 0) ? 0 : num_spatial_layers + sl - 1;
       ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl;
     }
     // Set the reference and update flags.
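
Besides the format-string fixes, the encoder now takes its timebase from Y4M input by inverting the frame rate, as the hunk above does: a 30000/1001 fps (29.97) stream yields g_timebase = 1001/30000. A sketch with a hypothetical helper:

    #include "vpx/vpx_encoder.h"

    /* Timebase is the reciprocal of the Y4M frame rate, matching the
     * assignments in the hunk above (helper name is hypothetical). */
    static void set_timebase_from_y4m(vpx_codec_enc_cfg_t *cfg,
                                      int fps_num, int fps_den) {
      cfg->g_timebase.den = fps_num; /* frame rate numerator */
      cfg->g_timebase.num = fps_den; /* frame rate denominator */
    }
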
diff --git a/libvpx/examples/vpx_temporal_svc_encoder.c b/libvpx/examples/vpx_temporal_svc_encoder.c
index ad3e79c..a800278 100644
--- a/libvpx/examples/vpx_temporal_svc_encoder.c
+++ b/libvpx/examples/vpx_temporal_svc_encoder.c
@@ -30,7 +30,7 @@
 
 #define ROI_MAP 0
 
-#define zero(Dest) memset(&(Dest), 0, sizeof(Dest));
+#define zero(Dest) memset(&(Dest), 0, sizeof(Dest))
 
 static const char *exec_name;
 
@@ -240,6 +240,38 @@
     }
   }
 }
+
+static void set_roi_skip_map(vpx_codec_enc_cfg_t *cfg, vpx_roi_map_t *roi,
+                             int *skip_map, int *prev_mask_map, int frame_num) {
+  const int block_size = 8;
+  unsigned int i, j;
+  roi->rows = (cfg->g_h + block_size - 1) / block_size;
+  roi->cols = (cfg->g_w + block_size - 1) / block_size;
+  zero(roi->skip);
+  zero(roi->delta_q);
+  zero(roi->delta_lf);
+  memset(roi->ref_frame, -1, sizeof(roi->ref_frame));
+  roi->ref_frame[1] = 1;
+  // Use segment 3 for skip.
+  roi->skip[3] = 1;
+  roi->roi_map =
+      (uint8_t *)calloc(roi->rows * roi->cols, sizeof(*roi->roi_map));
+  for (i = 0; i < roi->rows; ++i) {
+    for (j = 0; j < roi->cols; ++j) {
+      const int idx = i * roi->cols + j;
+      // Use segment 3 for skip.
+      // prev_mask_map keeps track of blocks that have been stably on segment 3
+      // for the past 10 frames. Only skip when the block is on segment 3 in
+      // both current map and prev_mask_map.
+      if (skip_map[idx] == 1 && prev_mask_map[idx] == 1) roi->roi_map[idx] = 3;
+      // Reset it every 10 frames so it doesn't propagate for too many frames.
+      if (frame_num % 10 == 0)
+        prev_mask_map[idx] = skip_map[idx];
+      else if (prev_mask_map[idx] == 1 && skip_map[idx] == 0)
+        prev_mask_map[idx] = 0;
+    }
+  }
+}
 #endif
 
 // Temporal scaling parameters:
@@ -574,6 +606,23 @@
   }
 }
 
+#if ROI_MAP
+static void read_mask(FILE *mask_file, int *seg_map) {
+  int mask_rows, mask_cols, i, j;
+  int *map_start = seg_map;
+  fscanf(mask_file, "%d %d\n", &mask_cols, &mask_rows);
+  for (i = 0; i < mask_rows; i++) {
+    for (j = 0; j < mask_cols; j++) {
+      fscanf(mask_file, "%d ", &seg_map[j]);
+      // reverse the bit
+      seg_map[j] = 1 - seg_map[j];
+    }
+    seg_map += mask_cols;
+  }
+  seg_map = map_start;
+}
+#endif
+
 int main(int argc, char **argv) {
   VpxVideoWriter *outfile[VPX_TS_MAX_LAYERS] = { NULL };
   vpx_codec_ctx_t codec;
@@ -613,7 +662,14 @@
   double sum_bitrate = 0.0;
   double sum_bitrate2 = 0.0;
   double framerate = 30.0;
-
+#if ROI_MAP
+  FILE *mask_file = NULL;
+  int block_size = 8;
+  int mask_rows = 0;
+  int mask_cols = 0;
+  int *mask_map;
+  int *prev_mask_map;
+#endif
   zero(rc.layer_target_bitrate);
   memset(&layer_id, 0, sizeof(vpx_svc_layer_id_t));
   memset(&input_ctx, 0, sizeof(input_ctx));
@@ -657,9 +713,15 @@
     die("Invalid layering mode (0..12) %s", argv[12]);
   }
 
+#if ROI_MAP
+  if (argc != min_args + mode_to_num_layers[layering_mode] + 1) {
+    die("Invalid number of arguments");
+  }
+#else
   if (argc != min_args + mode_to_num_layers[layering_mode]) {
     die("Invalid number of arguments");
   }
+#endif
 
   input_ctx.filename = argv[1];
   open_input_file(&input_ctx);
@@ -687,14 +749,14 @@
             &raw,
             bit_depth == VPX_BITS_8 ? VPX_IMG_FMT_I420 : VPX_IMG_FMT_I42016,
             width, height, 32)) {
-      die("Failed to allocate image", width, height);
+      die("Failed to allocate image (%dx%d)", width, height);
     }
   }
 #else
   // Y4M reader has its own allocation.
   if (input_ctx.file_type != FILE_TYPE_Y4M) {
     if (!vpx_img_alloc(&raw, VPX_IMG_FMT_I420, width, height, 32)) {
-      die("Failed to allocate image", width, height);
+      die("Failed to allocate image (%dx%d)", width, height);
     }
   }
 #endif  // CONFIG_VP9_HIGHBITDEPTH
@@ -817,6 +879,13 @@
 #endif  // CONFIG_VP9_HIGHBITDEPTH
     die("Failed to initialize encoder");
 
+#if ROI_MAP
+  mask_rows = (cfg.g_h + block_size - 1) / block_size;
+  mask_cols = (cfg.g_w + block_size - 1) / block_size;
+  mask_map = (int *)calloc(mask_rows * mask_cols, sizeof(*mask_map));
+  prev_mask_map = (int *)calloc(mask_rows * mask_cols, sizeof(*mask_map));
+#endif
+
   if (strncmp(encoder->name, "vp8", 3) == 0) {
     vpx_codec_control(&codec, VP8E_SET_CPUUSED, -speed);
     vpx_codec_control(&codec, VP8E_SET_NOISE_SENSITIVITY, kVp8DenoiserOff);
@@ -827,7 +896,6 @@
     if (vpx_codec_control(&codec, VP8E_SET_ROI_MAP, &roi))
       die_codec(&codec, "Failed to set ROI map");
 #endif
-
   } else if (strncmp(encoder->name, "vp9", 3) == 0) {
     vpx_svc_extra_cfg_t svc_params;
     memset(&svc_params, 0, sizeof(svc_params));
@@ -843,12 +911,7 @@
     vpx_codec_control(&codec, VP9E_SET_TUNE_CONTENT, 0);
     vpx_codec_control(&codec, VP9E_SET_TILE_COLUMNS, get_msb(cfg.g_threads));
     vpx_codec_control(&codec, VP9E_SET_DISABLE_LOOPFILTER, 0);
-#if ROI_MAP
-    set_roi_map(encoder->name, &cfg, &roi);
-    if (vpx_codec_control(&codec, VP9E_SET_ROI_MAP, &roi))
-      die_codec(&codec, "Failed to set ROI map");
-    vpx_codec_control(&codec, VP9E_SET_AQ_MODE, 0);
-#endif
+
     if (cfg.g_threads > 1)
       vpx_codec_control(&codec, VP9E_SET_ROW_MT, 1);
     else
@@ -881,6 +944,9 @@
     struct vpx_usec_timer timer;
     vpx_codec_iter_t iter = NULL;
     const vpx_codec_cx_pkt_t *pkt;
+#if ROI_MAP
+    char mask_file_name[255];
+#endif
     // Update the temporal layer_id. No spatial layers in this test.
     layer_id.spatial_layer_id = 0;
     layer_id.temporal_layer_id =
@@ -894,6 +960,19 @@
     }
     flags = layer_flags[frame_cnt % flag_periodicity];
     if (layering_mode == 0) flags = 0;
+#if ROI_MAP
+    snprintf(mask_file_name, sizeof(mask_file_name), "%s%05d.txt",
+             argv[argc - 1], frame_cnt);
+    mask_file = fopen(mask_file_name, "r");
+    if (mask_file != NULL) {
+      read_mask(mask_file, mask_map);
+      fclose(mask_file);
+      set_roi_skip_map(&cfg, &roi, mask_map, prev_mask_map, frame_cnt);
+      if (vpx_codec_control(&codec, VP9E_SET_ROI_MAP, &roi))
+        die_codec(&codec, "Failed to set ROI map");
+    }
+#endif
     frame_avail = read_frame(&input_ctx, &raw);
     if (frame_avail) ++rc.layer_input_frames[layer_id.temporal_layer_id];
     vpx_usec_timer_start(&timer);
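
The extra trailing argument enforced above is a per-frame mask-file prefix:
frame N reads "<prefix>%05d.txt", and when no such file exists the ROI map is
simply left unchanged for that frame. A minimal sketch of the naming scheme,
assuming a prefix "mask_":

    #include <stdio.h>
    int main(void) {
      char name[255];
      /* frame 12 with prefix "mask_" -> "mask_00012.txt" */
      snprintf(name, sizeof(name), "%s%05d.txt", "mask_", 12);
      puts(name);
      return 0;
    }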
@@ -963,6 +1042,10 @@
     ++frame_cnt;
     pts += frame_duration;
   }
+#if ROI_MAP
+  free(mask_map);
+  free(prev_mask_map);
+#endif
   close_input_file(&input_ctx);
   printout_rate_control_summary(&rc, &cfg, frame_cnt);
   printf("\n");
diff --git a/libvpx/libs.mk b/libvpx/libs.mk
index 7cd973b..00e49a1 100644
--- a/libvpx/libs.mk
+++ b/libvpx/libs.mk
@@ -94,15 +94,28 @@
   INSTALL_MAPS += include/vpx/% $(SRC_PATH_BARE)/$(VP9_PREFIX)/%
   CODEC_DOC_SRCS += vpx/vp8.h vpx/vp8cx.h vpx/vpx_ext_ratectrl.h
   CODEC_DOC_SECTIONS += vp9 vp9_encoder
+endif
 
-  RC_RTC_SRCS := $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS))
-  RC_RTC_SRCS += $(VP9_PREFIX)vp9cx.mk vpx/vp8.h vpx/vp8cx.h
-  RC_RTC_SRCS += vpx/vpx_ext_ratectrl.h
+RC_RTC_SRCS := vpx/vp8.h vpx/vp8cx.h
+RC_RTC_SRCS += vpx/vpx_ext_ratectrl.h
+RC_RTC_SRCS += vpx/internal/vpx_ratectrl_rtc.h
+ifeq ($(CONFIG_VP9_ENCODER),yes)
+  VP9_PREFIX=vp9/
+  RC_RTC_SRCS += $(addprefix $(VP9_PREFIX),$(call enabled,VP9_CX_SRCS))
+  RC_RTC_SRCS += $(VP9_PREFIX)vp9cx.mk
   RC_RTC_SRCS += $(VP9_PREFIX)ratectrl_rtc.cc
   RC_RTC_SRCS += $(VP9_PREFIX)ratectrl_rtc.h
   INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP9_PREFIX)ratectrl_rtc.cc
   INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP9_PREFIX)ratectrl_rtc.h
 endif
+ifeq ($(CONFIG_VP8_ENCODER),yes)
+  VP8_PREFIX=vp8/
+  RC_RTC_SRCS += $(addprefix $(VP8_PREFIX),$(call enabled,VP8_CX_SRCS))
+  RC_RTC_SRCS += $(VP8_PREFIX)vp8_ratectrl_rtc.cc
+  RC_RTC_SRCS += $(VP8_PREFIX)vp8_ratectrl_rtc.h
+  INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP8_PREFIX)vp8_ratectrl_rtc.cc
+  INSTALL-SRCS-$(CONFIG_CODEC_SRCS) += $(VP8_PREFIX)vp8_ratectrl_rtc.h
+endif
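
With the VP8 sources folded in, the rate-control RTC library is now
codec-agnostic, and the renames that follow mean consumers link against
libvpxrc instead of libvp9rc, e.g. (assumed consumer makefile):

    # was: LDLIBS += -L. -lvpx -lvp9rc
    LDLIBS += -L. -lvpx -lvpxrc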
 
 ifeq ($(CONFIG_VP9_DECODER),yes)
   VP9_PREFIX=vp9/
@@ -126,7 +139,7 @@
 ifeq ($(CONFIG_MSVS),yes)
 CODEC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxmt,vpxmd)
 GTEST_LIB=$(if $(CONFIG_STATIC_MSVCRT),gtestmt,gtestmd)
-RC_RTC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vp9rcmt,vp9rcmd)
+RC_RTC_LIB=$(if $(CONFIG_STATIC_MSVCRT),vpxrcmt,vpxrcmd)
 # This variable uses deferred expansion intentionally, since the results of
 # $(wildcard) may change during the course of the Make.
 VS_PLATFORMS = $(foreach d,$(wildcard */Release/$(CODEC_LIB).lib),$(word 1,$(subst /, ,$(d))))
@@ -249,16 +262,16 @@
 vpx.$(VCPROJ_SFX): vpx_config.asm
 vpx.$(VCPROJ_SFX): $(RTCD)
 
-vp9rc.$(VCPROJ_SFX): \
+vpxrc.$(VCPROJ_SFX): \
     VCPROJ_SRCS=$(filter-out $(addprefix %, $(ASM_INCLUDES)), $^)
 
-vp9rc.$(VCPROJ_SFX): $(RC_RTC_SRCS)
+vpxrc.$(VCPROJ_SFX): $(RC_RTC_SRCS)
 	@echo "    [CREATE] $@"
 	$(qexec)$(GEN_VCPROJ) \
             $(if $(CONFIG_SHARED),--dll,--lib) \
             --target=$(TOOLCHAIN) \
             $(if $(CONFIG_STATIC_MSVCRT),--static-crt) \
-            --name=vp9rc \
+            --name=vpxrc \
             --proj-guid=C26FF952-9494-4838-9A3F-7F3D4F613385 \
             --ver=$(CONFIG_VS_VERSION) \
             --src-path-bare="$(SRC_PATH_BARE)" \
@@ -275,10 +288,10 @@
               $(VCPROJ_SRCS)) \
             --src-path-bare="$(SRC_PATH_BARE)" \
 
-PROJECTS-yes += vp9rc.$(VCPROJ_SFX)
+PROJECTS-yes += vpxrc.$(VCPROJ_SFX)
 
-vp9rc.$(VCPROJ_SFX): vpx_config.asm
-vp9rc.$(VCPROJ_SFX): $(RTCD)
+vpxrc.$(VCPROJ_SFX): vpx_config.asm
+vpxrc.$(VCPROJ_SFX): $(RTCD)
 
 endif # ifeq ($(CONFIG_MSVS),yes)
 else # ifeq ($(CONFIG_EXTERNAL_BUILD),yes)
@@ -300,7 +313,7 @@
 # SO_VERSION_* then follow the rules in the link to determine the new version
 # (c1, a1, r1) and set MAJOR to [c1-a1], MINOR to a1 and PATCH to r1
 SO_VERSION_MAJOR := 7
-SO_VERSION_MINOR := 0
+SO_VERSION_MINOR := 1
 SO_VERSION_PATCH := 0
 ifeq ($(filter darwin%,$(TGT_OS)),$(TGT_OS))
 LIBVPX_SO               := libvpx.$(SO_VERSION_MAJOR).dylib
@@ -398,12 +411,11 @@
 INSTALL_MAPS += $(LIBSUBDIR)/pkgconfig/%.pc %.pc
 CLEAN-OBJS += vpx.pc
 
-ifeq ($(CONFIG_VP9_ENCODER),yes)
-  RC_RTC_OBJS=$(call objs,$(RC_RTC_SRCS))
+ifeq ($(CONFIG_ENCODERS),yes)
   RC_RTC_OBJS=$(call objs,$(RC_RTC_SRCS))
   OBJS-yes += $(RC_RTC_OBJS)
-  LIBS-yes += $(BUILD_PFX)libvp9rc.a $(BUILD_PFX)libvp9rc_g.a
-  $(BUILD_PFX)libvp9rc_g.a: $(RC_RTC_OBJS)
+  LIBS-yes += $(BUILD_PFX)libvpxrc.a $(BUILD_PFX)libvpxrc_g.a
+  $(BUILD_PFX)libvpxrc_g.a: $(RC_RTC_OBJS)
 endif
 
 ifeq ($(CONFIG_VP9_ENCODER)$(CONFIG_RATE_CTRL),yesyes)
@@ -493,7 +505,7 @@
                            $(call enabled,TEST_INTRA_PRED_SPEED_SRCS))
 TEST_INTRA_PRED_SPEED_OBJS := $(sort $(call objs,$(TEST_INTRA_PRED_SPEED_SRCS)))
 
-ifeq ($(CONFIG_VP9_ENCODER),yes)
+ifeq ($(CONFIG_ENCODERS),yes)
 RC_INTERFACE_TEST_BIN=./test_rc_interface$(EXE_SFX)
 RC_INTERFACE_TEST_SRCS=$(call addprefix_clean,test/,\
                        $(call enabled,RC_INTERFACE_TEST_SRCS))
@@ -599,11 +611,11 @@
             -L. -l$(CODEC_LIB) -l$(GTEST_LIB) $^
 endif  # TEST_INTRA_PRED_SPEED
 
-ifeq ($(CONFIG_VP9_ENCODER),yes)
+ifeq ($(CONFIG_ENCODERS),yes)
 ifneq ($(strip $(RC_INTERFACE_TEST_OBJS)),)
 PROJECTS-$(CONFIG_MSVS) += test_rc_interface.$(VCPROJ_SFX)
 test_rc_interface.$(VCPROJ_SFX): $(RC_INTERFACE_TEST_SRCS) vpx.$(VCPROJ_SFX) \
-	vp9rc.$(VCPROJ_SFX) gtest.$(VCPROJ_SFX)
+	vpxrc.$(VCPROJ_SFX) gtest.$(VCPROJ_SFX)
 	@echo "    [CREATE] $@"
 	$(qexec)$(GEN_VCPROJ) \
             --exe \
@@ -661,19 +673,19 @@
               -L. -lvpx -lgtest $(extralibs) -lm))
 endif  # TEST_INTRA_PRED_SPEED
 
-ifeq ($(CONFIG_VP9_ENCODER),yes)
+ifeq ($(CONFIG_ENCODERS),yes)
 ifneq ($(strip $(RC_INTERFACE_TEST_OBJS)),)
 $(RC_INTERFACE_TEST_OBJS) $(RC_INTERFACE_TEST_OBJS:.o=.d): \
   CXXFLAGS += $(GTEST_INCLUDES)
 OBJS-yes += $(RC_INTERFACE_TEST_OBJS)
 BINS-yes += $(RC_INTERFACE_TEST_BIN)
 
-$(RC_INTERFACE_TEST_BIN): $(TEST_LIBS) libvp9rc.a
+$(RC_INTERFACE_TEST_BIN): $(TEST_LIBS) libvpxrc.a
 $(eval $(call linkerxx_template,$(RC_INTERFACE_TEST_BIN), \
               $(RC_INTERFACE_TEST_OBJS) \
-              -L. -lvpx -lgtest -lvp9rc $(extralibs) -lm))
+              -L. -lvpx -lgtest -lvpxrc $(extralibs) -lm))
 endif  # RC_INTERFACE_TEST
-endif  # CONFIG_VP9_ENCODER
+endif  # CONFIG_ENCODERS
 
 ifneq ($(strip $(SIMPLE_ENCODE_TEST_OBJS)),)
 $(SIMPLE_ENCODE_TEST_OBJS) $(SIMPLE_ENCODE_TEST_OBJS:.o=.d): \
diff --git a/libvpx/rate_hist.c b/libvpx/rate_hist.c
index 6cf8ce7..947950d 100644
--- a/libvpx/rate_hist.c
+++ b/libvpx/rate_hist.c
@@ -193,40 +193,42 @@
 
 static void show_histogram(const struct hist_bucket *bucket, int buckets,
                            int total, int scale) {
-  const char *pat1, *pat2;
+  int width1, width2;
   int i;
 
+  if (!buckets) return;
   assert(bucket != NULL);
+  assert(buckets > 0);
 
   switch ((int)(log(bucket[buckets - 1].high) / log(10)) + 1) {
     case 1:
     case 2:
-      pat1 = "%4d %2s: ";
-      pat2 = "%4d-%2d: ";
+      width1 = 4;
+      width2 = 2;
       break;
     case 3:
-      pat1 = "%5d %3s: ";
-      pat2 = "%5d-%3d: ";
+      width1 = 5;
+      width2 = 3;
       break;
     case 4:
-      pat1 = "%6d %4s: ";
-      pat2 = "%6d-%4d: ";
+      width1 = 6;
+      width2 = 4;
       break;
     case 5:
-      pat1 = "%7d %5s: ";
-      pat2 = "%7d-%5d: ";
+      width1 = 7;
+      width2 = 5;
       break;
     case 6:
-      pat1 = "%8d %6s: ";
-      pat2 = "%8d-%6d: ";
+      width1 = 8;
+      width2 = 6;
       break;
     case 7:
-      pat1 = "%9d %7s: ";
-      pat2 = "%9d-%7d: ";
+      width1 = 9;
+      width2 = 7;
       break;
     default:
-      pat1 = "%12d %10s: ";
-      pat2 = "%12d-%10d: ";
+      width1 = 12;
+      width2 = 10;
       break;
   }
 
@@ -241,9 +243,10 @@
     assert(len <= HIST_BAR_MAX);
 
     if (bucket[i].low == bucket[i].high)
-      fprintf(stderr, pat1, bucket[i].low, "");
+      fprintf(stderr, "%*d %*s: ", width1, bucket[i].low, width2, "");
     else
-      fprintf(stderr, pat2, bucket[i].low, bucket[i].high);
+      fprintf(stderr, "%*d-%*d: ", width1, bucket[i].low, width2,
+              bucket[i].high);
 
     for (j = 0; j < HIST_BAR_MAX; j++) fprintf(stderr, j < len ? "=" : " ");
     fprintf(stderr, "\t%5d (%6.2f%%)\n", bucket[i].count, pct);
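
The rewrite above drops the hand-picked format strings in favor of printf's
dynamic field width: a '*' in a conversion consumes an int argument as the
width. A minimal sketch:

    #include <stdio.h>
    int main(void) {
      printf("[%*d]\n", 6, 42);  /* prints "[    42]": width 6 from the args */
      printf("[%*s]\n", 4, "");  /* prints "[    ]": an empty field 4 wide */
      return 0;
    }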
diff --git a/libvpx/test/active_map_test.cc b/libvpx/test/active_map_test.cc
index 9c55f9a..543ec0d 100644
--- a/libvpx/test/active_map_test.cc
+++ b/libvpx/test/active_map_test.cc
@@ -19,7 +19,8 @@
 
 class ActiveMapTest
     : public ::libvpx_test::EncoderTest,
-      public ::libvpx_test::CodecTestWith2Params<libvpx_test::TestMode, int> {
+      public ::libvpx_test::CodecTestWith3Params<libvpx_test::TestMode, int,
+                                                 int> {
  protected:
   static const int kWidth = 208;
   static const int kHeight = 144;
@@ -37,6 +38,7 @@
                                   ::libvpx_test::Encoder *encoder) {
     if (video->frame() == 0) {
       encoder->Control(VP8E_SET_CPUUSED, cpu_used_);
+      encoder->Control(VP9E_SET_AQ_MODE, GET_PARAM(3));
     } else if (video->frame() == 3) {
       vpx_active_map_t map = vpx_active_map_t();
       /* clang-format off */
@@ -87,5 +89,5 @@
 
 VP9_INSTANTIATE_TEST_SUITE(ActiveMapTest,
                            ::testing::Values(::libvpx_test::kRealTime),
-                           ::testing::Range(0, 9));
+                           ::testing::Range(5, 10), ::testing::Values(0, 3));
 }  // namespace
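
The suite now also parameterizes VP9's AQ mode: Range(5, 10) covers cpu-used
5 through 9 (gtest's Range excludes its upper bound) and Values(0, 3) runs
each speed with AQ off and with cyclic-refresh AQ.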
diff --git a/libvpx/test/add_noise_test.cc b/libvpx/test/add_noise_test.cc
index 25de427..7dc86e3 100644
--- a/libvpx/test/add_noise_test.cc
+++ b/libvpx/test/add_noise_test.cc
@@ -23,7 +23,6 @@
 
 static const int kNoiseSize = 3072;
 
-// TODO(jimbankoski): make width and height integers not unsigned.
 typedef void (*AddNoiseFunc)(uint8_t *start, const int8_t *noise,
                              int blackclamp, int whiteclamp, int width,
                              int height, int pitch);
diff --git a/libvpx/test/buffer.h b/libvpx/test/buffer.h
index b003d2f..023939c 100644
--- a/libvpx/test/buffer.h
+++ b/libvpx/test/buffer.h
@@ -31,7 +31,7 @@
       : width_(width), height_(height), top_padding_(top_padding),
         left_padding_(left_padding), right_padding_(right_padding),
         bottom_padding_(bottom_padding), alignment_(0), padding_value_(0),
-        stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {}
+        stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {}
 
   Buffer(int width, int height, int top_padding, int left_padding,
          int right_padding, int bottom_padding, unsigned int alignment)
@@ -39,19 +39,19 @@
         left_padding_(left_padding), right_padding_(right_padding),
         bottom_padding_(bottom_padding), alignment_(alignment),
         padding_value_(0), stride_(0), raw_size_(0), num_elements_(0),
-        raw_buffer_(NULL) {}
+        raw_buffer_(nullptr) {}
 
   Buffer(int width, int height, int padding)
       : width_(width), height_(height), top_padding_(padding),
         left_padding_(padding), right_padding_(padding),
         bottom_padding_(padding), alignment_(0), padding_value_(0), stride_(0),
-        raw_size_(0), num_elements_(0), raw_buffer_(NULL) {}
+        raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {}
 
   Buffer(int width, int height, int padding, unsigned int alignment)
       : width_(width), height_(height), top_padding_(padding),
         left_padding_(padding), right_padding_(padding),
         bottom_padding_(padding), alignment_(alignment), padding_value_(0),
-        stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(NULL) {}
+        stride_(0), raw_size_(0), num_elements_(0), raw_buffer_(nullptr) {}
 
   ~Buffer() {
     if (alignment_) {
@@ -103,7 +103,7 @@
   bool CheckValues(const Buffer<T> &a) const;
 
   bool Init() {
-    if (raw_buffer_ != NULL) return false;
+    if (raw_buffer_ != nullptr) return false;
     EXPECT_GT(width_, 0);
     EXPECT_GT(height_, 0);
     EXPECT_GE(top_padding_, 0);
@@ -126,7 +126,7 @@
     } else {
       raw_buffer_ = new (std::nothrow) T[num_elements_];
     }
-    EXPECT_TRUE(raw_buffer_ != NULL);
+    EXPECT_NE(raw_buffer_, nullptr);
     SetPadding(std::numeric_limits<T>::max());
     return !::testing::Test::HasFailure();
   }
@@ -150,7 +150,7 @@
 
 template <typename T>
 T *Buffer<T>::TopLeftPixel() const {
-  if (!raw_buffer_) return NULL;
+  if (!raw_buffer_) return nullptr;
   return raw_buffer_ + (top_padding_ * stride_) + left_padding_;
 }
 
diff --git a/libvpx/test/codec_factory.h b/libvpx/test/codec_factory.h
index 77ce49d..9609261 100644
--- a/libvpx/test/codec_factory.h
+++ b/libvpx/test/codec_factory.h
@@ -88,7 +88,7 @@
 #if CONFIG_VP8_DECODER
     return &vpx_codec_vp8_dx_algo;
 #else
-    return NULL;
+    return nullptr;
 #endif
   }
 };
@@ -104,7 +104,7 @@
 #if CONFIG_VP8_ENCODER
     return &vpx_codec_vp8_cx_algo;
 #else
-    return NULL;
+    return nullptr;
 #endif
   }
 };
@@ -124,7 +124,7 @@
 #else
     (void)cfg;
     (void)flags;
-    return NULL;
+    return nullptr;
 #endif
   }
 
@@ -139,7 +139,7 @@
     (void)deadline;
     (void)init_flags;
     (void)stats;
-    return NULL;
+    return nullptr;
 #endif
   }
 
@@ -184,7 +184,7 @@
 #if CONFIG_VP9_DECODER
     return &vpx_codec_vp9_dx_algo;
 #else
-    return NULL;
+    return nullptr;
 #endif
   }
 };
@@ -200,7 +200,7 @@
 #if CONFIG_VP9_ENCODER
     return &vpx_codec_vp9_cx_algo;
 #else
-    return NULL;
+    return nullptr;
 #endif
   }
 };
@@ -220,7 +220,7 @@
 #else
     (void)cfg;
     (void)flags;
-    return NULL;
+    return nullptr;
 #endif
   }
 
@@ -235,7 +235,7 @@
     (void)deadline;
     (void)init_flags;
     (void)stats;
-    return NULL;
+    return nullptr;
 #endif
   }
 
diff --git a/libvpx/test/comp_avg_pred_test.cc b/libvpx/test/comp_avg_pred_test.cc
index b9201a2..3977a2d 100644
--- a/libvpx/test/comp_avg_pred_test.cc
+++ b/libvpx/test/comp_avg_pred_test.cc
@@ -183,4 +183,9 @@
 INSTANTIATE_TEST_SUITE_P(VSX, AvgPredTest,
                          ::testing::Values(&vpx_comp_avg_pred_vsx));
 #endif  // HAVE_VSX
+
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(LSX, AvgPredTest,
+                         ::testing::Values(&vpx_comp_avg_pred_lsx));
+#endif  // HAVE_LSX
 }  // namespace
diff --git a/libvpx/test/convolve_test.cc b/libvpx/test/convolve_test.cc
index 4b2dade..d569048 100644
--- a/libvpx/test/convolve_test.cc
+++ b/libvpx/test/convolve_test.cc
@@ -1449,6 +1449,19 @@
                          ::testing::ValuesIn(kArrayConvolve8_msa));
 #endif  // HAVE_MSA
 
+#if HAVE_LSX
+const ConvolveFunctions convolve8_lsx(
+    vpx_convolve_copy_lsx, vpx_convolve_avg_lsx, vpx_convolve8_horiz_lsx,
+    vpx_convolve8_avg_horiz_lsx, vpx_convolve8_vert_lsx,
+    vpx_convolve8_avg_vert_lsx, vpx_convolve8_lsx, vpx_convolve8_avg_lsx,
+    vpx_scaled_horiz_c, vpx_scaled_avg_horiz_c, vpx_scaled_vert_c,
+    vpx_scaled_avg_vert_c, vpx_scaled_2d_c, vpx_scaled_avg_2d_c, 0);
+
+const ConvolveParam kArrayConvolve8_lsx[] = { ALL_SIZES(convolve8_lsx) };
+INSTANTIATE_TEST_SUITE_P(LSX, ConvolveTest,
+                         ::testing::ValuesIn(kArrayConvolve8_lsx));
+#endif  // HAVE_LSX
+
 #if HAVE_VSX
 const ConvolveFunctions convolve8_vsx(
     vpx_convolve_copy_vsx, vpx_convolve_avg_vsx, vpx_convolve8_horiz_vsx,
diff --git a/libvpx/test/dct16x16_test.cc b/libvpx/test/dct16x16_test.cc
index c04880e..06837d8 100644
--- a/libvpx/test/dct16x16_test.cc
+++ b/libvpx/test/dct16x16_test.cc
@@ -868,4 +868,11 @@
     ::testing::Values(make_tuple(&vpx_fdct16x16_c, &vpx_idct16x16_256_add_vsx,
                                  0, VPX_BITS_8)));
 #endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(LSX, Trans16x16DCT,
+                         ::testing::Values(make_tuple(&vpx_fdct16x16_lsx,
+                                                      &vpx_idct16x16_256_add_c,
+                                                      0, VPX_BITS_8)));
+#endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
diff --git a/libvpx/test/dct32x32_test.cc b/libvpx/test/dct32x32_test.cc
index 8398e17..91bb8e0 100644
--- a/libvpx/test/dct32x32_test.cc
+++ b/libvpx/test/dct32x32_test.cc
@@ -396,4 +396,13 @@
                       make_tuple(&vpx_fdct32x32_rd_vsx,
                                  &vpx_idct32x32_1024_add_vsx, 1, VPX_BITS_8)));
 #endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(
+    LSX, Trans32x32Test,
+    ::testing::Values(make_tuple(&vpx_fdct32x32_lsx,
+                                 &vpx_idct32x32_1024_add_lsx, 0, VPX_BITS_8),
+                      make_tuple(&vpx_fdct32x32_rd_lsx,
+                                 &vpx_idct32x32_1024_add_lsx, 1, VPX_BITS_8)));
+#endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
diff --git a/libvpx/test/dct_test.cc b/libvpx/test/dct_test.cc
index 9541869..2182f87 100644
--- a/libvpx/test/dct_test.cc
+++ b/libvpx/test/dct_test.cc
@@ -586,6 +586,23 @@
                                                       VPX_BITS_8)));
 #endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH &&
 
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
+static const FuncInfo dct_lsx_func_info[4] = {
+  { &fdct_wrapper<vpx_fdct4x4_lsx>, &idct_wrapper<vpx_idct4x4_16_add_c>, 4, 1 },
+  { &fdct_wrapper<vpx_fdct8x8_lsx>, &idct_wrapper<vpx_idct8x8_64_add_c>, 8, 1 },
+  { &fdct_wrapper<vpx_fdct16x16_lsx>, &idct_wrapper<vpx_idct16x16_256_add_c>,
+    16, 1 },
+  { &fdct_wrapper<vpx_fdct32x32_lsx>, &idct_wrapper<vpx_idct32x32_1024_add_lsx>,
+    32, 1 }
+};
+
+INSTANTIATE_TEST_SUITE_P(
+    LSX, TransDCT,
+    ::testing::Combine(::testing::Range(0, 4),
+                       ::testing::Values(dct_lsx_func_info),
+                       ::testing::Values(0), ::testing::Values(VPX_BITS_8)));
+#endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
+
 #endif  // !CONFIG_EMULATE_HARDWARE
 
 /* -------------------------------------------------------------------------- */
@@ -641,8 +658,11 @@
     &highbd_iht_wrapper<vp9_highbd_iht16x16_256_add_neon>, 16, 2 },
 #endif
   { &vp9_fht4x4_c, &iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 1 },
+  { &vp9_fht4x4_neon, &iht_wrapper<vp9_iht4x4_16_add_neon>, 4, 1 },
   { &vp9_fht8x8_c, &iht_wrapper<vp9_iht8x8_64_add_neon>, 8, 1 },
-  { &vp9_fht16x16_c, &iht_wrapper<vp9_iht16x16_256_add_neon>, 16, 1 }
+  { &vp9_fht8x8_neon, &iht_wrapper<vp9_iht8x8_64_add_neon>, 8, 1 },
+  { &vp9_fht16x16_c, &iht_wrapper<vp9_iht16x16_256_add_neon>, 16, 1 },
+  { &vp9_fht16x16_neon, &iht_wrapper<vp9_iht16x16_256_add_neon>, 16, 1 }
 };
 
 INSTANTIATE_TEST_SUITE_P(
@@ -753,4 +773,5 @@
                          ::testing::Values(make_tuple(0, &wht_vsx_func_info, 0,
                                                       VPX_BITS_8)));
 #endif  // HAVE_VSX && !CONFIG_EMULATE_HARDWARE
+
 }  // namespace
diff --git a/libvpx/test/decode_test_driver.h b/libvpx/test/decode_test_driver.h
index 04876cd..f446ab4 100644
--- a/libvpx/test/decode_test_driver.h
+++ b/libvpx/test/decode_test_driver.h
@@ -24,7 +24,7 @@
 class DxDataIterator {
  public:
   explicit DxDataIterator(vpx_codec_ctx_t *decoder)
-      : decoder_(decoder), iter_(NULL) {}
+      : decoder_(decoder), iter_(nullptr) {}
 
   const vpx_image_t *Next() { return vpx_codec_get_frame(decoder_, &iter_); }
 
diff --git a/libvpx/test/encode_api_test.cc b/libvpx/test/encode_api_test.cc
index dec19b2..6f61c77 100644
--- a/libvpx/test/encode_api_test.cc
+++ b/libvpx/test/encode_api_test.cc
@@ -10,16 +10,25 @@
 
 #include <climits>
 #include <cstring>
+#include <initializer_list>
 
 #include "third_party/googletest/src/include/gtest/gtest.h"
 
 #include "./vpx_config.h"
+#include "test/video_source.h"
 #include "vpx/vp8cx.h"
 #include "vpx/vpx_encoder.h"
 
 namespace {
 
-#define NELEMENTS(x) static_cast<int>(sizeof(x) / sizeof(x[0]))
+const vpx_codec_iface_t *kCodecIfaces[] = {
+#if CONFIG_VP8_ENCODER
+  &vpx_codec_vp8_cx_algo,
+#endif
+#if CONFIG_VP9_ENCODER
+  &vpx_codec_vp9_cx_algo,
+#endif
+};
 
 bool IsVP9(const vpx_codec_iface_t *iface) {
   static const char kVP9Name[] = "WebM Project VP9";
@@ -28,14 +37,6 @@
 }
 
 TEST(EncodeAPI, InvalidParams) {
-  static const vpx_codec_iface_t *kCodecs[] = {
-#if CONFIG_VP8_ENCODER
-    &vpx_codec_vp8_cx_algo,
-#endif
-#if CONFIG_VP9_ENCODER
-    &vpx_codec_vp9_cx_algo,
-#endif
-  };
   uint8_t buf[1] = { 0 };
   vpx_image_t img;
   vpx_codec_ctx_t enc;
@@ -58,17 +59,17 @@
             vpx_codec_enc_config_default(nullptr, &cfg, 0));
   EXPECT_NE(vpx_codec_error(nullptr), nullptr);
 
-  for (int i = 0; i < NELEMENTS(kCodecs); ++i) {
-    SCOPED_TRACE(vpx_codec_iface_name(kCodecs[i]));
+  for (const auto *iface : kCodecIfaces) {
+    SCOPED_TRACE(vpx_codec_iface_name(iface));
     EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
-              vpx_codec_enc_init(nullptr, kCodecs[i], nullptr, 0));
+              vpx_codec_enc_init(nullptr, iface, nullptr, 0));
     EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
-              vpx_codec_enc_init(&enc, kCodecs[i], nullptr, 0));
+              vpx_codec_enc_init(&enc, iface, nullptr, 0));
     EXPECT_EQ(VPX_CODEC_INVALID_PARAM,
-              vpx_codec_enc_config_default(kCodecs[i], &cfg, 1));
+              vpx_codec_enc_config_default(iface, &cfg, 1));
 
-    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(kCodecs[i], &cfg, 0));
-    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, kCodecs[i], &cfg, 0));
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_config_default(iface, &cfg, 0));
+    EXPECT_EQ(VPX_CODEC_OK, vpx_codec_enc_init(&enc, iface, &cfg, 0));
     EXPECT_EQ(VPX_CODEC_OK, vpx_codec_encode(&enc, nullptr, 0, 0, 0, 0));
 
     EXPECT_EQ(VPX_CODEC_OK, vpx_codec_destroy(&enc));
@@ -124,14 +125,6 @@
 // (ts_target_bitrate[]) to 0 for both layers. This should fail independent of
 // CONFIG_MULTI_RES_ENCODING.
 TEST(EncodeAPI, MultiResEncode) {
-  static const vpx_codec_iface_t *kCodecs[] = {
-#if CONFIG_VP8_ENCODER
-    &vpx_codec_vp8_cx_algo,
-#endif
-#if CONFIG_VP9_ENCODER
-    &vpx_codec_vp9_cx_algo,
-#endif
-  };
   const int width = 1280;
   const int height = 720;
   const int width_down = width / 2;
@@ -139,8 +132,7 @@
   const int target_bitrate = 1000;
   const int framerate = 30;
 
-  for (int c = 0; c < NELEMENTS(kCodecs); ++c) {
-    const vpx_codec_iface_t *const iface = kCodecs[c];
+  for (const auto *iface : kCodecIfaces) {
     vpx_codec_ctx_t enc[2];
     vpx_codec_enc_cfg_t cfg[2];
     vpx_rational_t dsf[2] = { { 2, 1 }, { 2, 1 } };
@@ -310,4 +302,62 @@
   }
 }
 
+void InitCodec(const vpx_codec_iface_t &iface, int width, int height,
+               vpx_codec_ctx_t *enc, vpx_codec_enc_cfg_t *cfg) {
+  ASSERT_EQ(vpx_codec_enc_config_default(&iface, cfg, 0), VPX_CODEC_OK);
+  cfg->g_w = width;
+  cfg->g_h = height;
+  cfg->g_lag_in_frames = 0;
+  cfg->g_pass = VPX_RC_ONE_PASS;
+  ASSERT_EQ(vpx_codec_enc_init(enc, &iface, cfg, 0), VPX_CODEC_OK);
+
+  ASSERT_EQ(vpx_codec_control_(enc, VP8E_SET_CPUUSED, 2), VPX_CODEC_OK);
+}
+
+// Encodes 1 frame of size |cfg.g_w| x |cfg.g_h| setting |enc|'s configuration
+// to |cfg|.
+void EncodeWithConfig(const vpx_codec_enc_cfg_t &cfg, vpx_codec_ctx_t *enc) {
+  libvpx_test::DummyVideoSource video;
+  video.SetSize(cfg.g_w, cfg.g_h);
+  video.Begin();
+  EXPECT_EQ(vpx_codec_enc_config_set(enc, &cfg), VPX_CODEC_OK)
+      << vpx_codec_error_detail(enc);
+
+  EXPECT_EQ(vpx_codec_encode(enc, video.img(), video.pts(), video.duration(),
+                             /*flags=*/0, VPX_DL_GOOD_QUALITY),
+            VPX_CODEC_OK)
+      << vpx_codec_error_detail(enc);
+}
+
+TEST(EncodeAPI, ConfigChangeThreadCount) {
+  constexpr int kWidth = 1920;
+  constexpr int kHeight = 1080;
+
+  for (const auto *iface : kCodecIfaces) {
+    SCOPED_TRACE(vpx_codec_iface_name(iface));
+    for (int i = 0; i < (IsVP9(iface) ? 2 : 1); ++i) {
+      vpx_codec_enc_cfg_t cfg;
+      struct Encoder {
+        ~Encoder() { EXPECT_EQ(vpx_codec_destroy(&ctx), VPX_CODEC_OK); }
+        vpx_codec_ctx_t ctx = {};
+      } enc;
+
+      EXPECT_NO_FATAL_FAILURE(
+          InitCodec(*iface, kWidth, kHeight, &enc.ctx, &cfg));
+      if (IsVP9(iface)) {
+        EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_TILE_COLUMNS, 6),
+                  VPX_CODEC_OK);
+        EXPECT_EQ(vpx_codec_control_(&enc.ctx, VP9E_SET_ROW_MT, i),
+                  VPX_CODEC_OK);
+      }
+
+      for (const auto threads : { 1, 4, 8, 6, 2, 1 }) {
+        cfg.g_threads = threads;
+        EXPECT_NO_FATAL_FAILURE(EncodeWithConfig(cfg, &enc.ctx))
+            << "iteration: " << i << " threads: " << threads;
+      }
+    }
+  }
+}
+
 }  // namespace
diff --git a/libvpx/test/encode_test_driver.h b/libvpx/test/encode_test_driver.h
index 38c6195..7085945 100644
--- a/libvpx/test/encode_test_driver.h
+++ b/libvpx/test/encode_test_driver.h
@@ -49,7 +49,7 @@
 class CxDataIterator {
  public:
   explicit CxDataIterator(vpx_codec_ctx_t *encoder)
-      : encoder_(encoder), iter_(NULL) {}
+      : encoder_(encoder), iter_(nullptr) {}
 
   const vpx_codec_cx_pkt_t *Next() {
     return vpx_codec_get_cx_data(encoder_, &iter_);
diff --git a/libvpx/test/fdct8x8_test.cc b/libvpx/test/fdct8x8_test.cc
index 0822666..83d1ff1 100644
--- a/libvpx/test/fdct8x8_test.cc
+++ b/libvpx/test/fdct8x8_test.cc
@@ -768,4 +768,11 @@
                                                       &vpx_idct8x8_64_add_vsx,
                                                       0, VPX_BITS_8)));
 #endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
+INSTANTIATE_TEST_SUITE_P(LSX, FwdTrans8x8DCT,
+                         ::testing::Values(make_tuple(&vpx_fdct8x8_lsx,
+                                                      &vpx_idct8x8_64_add_c, 0,
+                                                      VPX_BITS_8)));
+#endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH && !CONFIG_EMULATE_HARDWARE
 }  // namespace
diff --git a/libvpx/test/hadamard_test.cc b/libvpx/test/hadamard_test.cc
index dab945a..10b1e79 100644
--- a/libvpx/test/hadamard_test.cc
+++ b/libvpx/test/hadamard_test.cc
@@ -285,6 +285,13 @@
                       HadamardFuncWithSize(&vpx_hadamard_16x16_vsx, 16)));
 #endif  // HAVE_VSX
 
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(
+    LSX, HadamardLowbdTest,
+    ::testing::Values(HadamardFuncWithSize(&vpx_hadamard_8x8_lsx, 8),
+                      HadamardFuncWithSize(&vpx_hadamard_16x16_lsx, 16)));
+#endif  // HAVE_LSX
+
 #if CONFIG_VP9_HIGHBITDEPTH
 class HadamardHighbdTest : public HadamardTestBase {
  protected:
diff --git a/libvpx/test/ivf_video_source.h b/libvpx/test/ivf_video_source.h
index 22c05ec..a8ac4f1 100644
--- a/libvpx/test/ivf_video_source.h
+++ b/libvpx/test/ivf_video_source.h
@@ -29,8 +29,9 @@
 class IVFVideoSource : public CompressedVideoSource {
  public:
   explicit IVFVideoSource(const std::string &file_name)
-      : file_name_(file_name), input_file_(NULL), compressed_frame_buf_(NULL),
-        frame_sz_(0), frame_(0), end_of_file_(false) {}
+      : file_name_(file_name), input_file_(nullptr),
+        compressed_frame_buf_(nullptr), frame_sz_(0), frame_(0),
+        end_of_file_(false) {}
 
   virtual ~IVFVideoSource() {
     delete[] compressed_frame_buf_;
@@ -41,13 +42,12 @@
   virtual void Init() {
     // Allocate a buffer to read the compressed video frame into.
     compressed_frame_buf_ = new uint8_t[libvpx_test::kCodeBufferSize];
-    ASSERT_TRUE(compressed_frame_buf_ != NULL)
-        << "Allocate frame buffer failed";
+    ASSERT_NE(compressed_frame_buf_, nullptr)
+        << "Frame buffer allocation failed";
   }
 
   virtual void Begin() {
     input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_ != NULL)
+    ASSERT_NE(input_file_, nullptr)
         << "Input file open failed. Filename: " << file_name_;
 
     // Read file header
@@ -68,7 +68,7 @@
   }
 
   void FillFrame() {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     uint8_t frame_hdr[kIvfFrameHdrSize];
     // Check frame header and read a frame from input_file.
     if (fread(frame_hdr, 1, kIvfFrameHdrSize, input_file_) !=
@@ -87,7 +87,7 @@
   }
 
   virtual const uint8_t *cxdata() const {
-    return end_of_file_ ? NULL : compressed_frame_buf_;
+    return end_of_file_ ? nullptr : compressed_frame_buf_;
   }
   virtual size_t frame_size() const { return frame_sz_; }
   virtual unsigned int frame_number() const { return frame_; }
diff --git a/libvpx/test/lpf_test.cc b/libvpx/test/lpf_test.cc
index 62c6f30..4cc99a6 100644
--- a/libvpx/test/lpf_test.cc
+++ b/libvpx/test/lpf_test.cc
@@ -147,7 +147,7 @@
 };
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test6Param);
 
-#if HAVE_NEON || HAVE_SSE2 || \
+#if HAVE_NEON || HAVE_SSE2 || (HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH) || \
     (HAVE_DSPR2 || HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH)
 class Loop8Test9Param : public ::testing::TestWithParam<dualloop8_param_t> {
  public:
@@ -169,7 +169,7 @@
 };
 GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(Loop8Test9Param);
 #endif  // HAVE_NEON || HAVE_SSE2 || (HAVE_DSPR2 || HAVE_MSA &&
-        // (!CONFIG_VP9_HIGHBITDEPTH))
+        // (!CONFIG_VP9_HIGHBITDEPTH) || (HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH))
 
 TEST_P(Loop8Test6Param, OperationCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -281,7 +281,7 @@
       << "First failed at test case " << first_failure;
 }
 
-#if HAVE_NEON || HAVE_SSE2 || \
+#if HAVE_NEON || HAVE_SSE2 || (HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH)) || \
     (HAVE_DSPR2 || HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH))
 TEST_P(Loop8Test9Param, OperationCheck) {
   ACMRandom rnd(ACMRandom::DeterministicSeed());
@@ -411,6 +411,7 @@
       << "First failed at test case " << first_failure;
 }
 #endif  // HAVE_NEON || HAVE_SSE2 || (HAVE_DSPR2 || HAVE_MSA &&
+        // (!CONFIG_VP9_HIGHBITDEPTH)) || (HAVE_LSX &&
         // (!CONFIG_VP9_HIGHBITDEPTH))
 
 using std::make_tuple;
@@ -692,4 +693,29 @@
                                  &vpx_lpf_vertical_8_dual_c, 8)));
 #endif  // HAVE_MSA && (!CONFIG_VP9_HIGHBITDEPTH)
 
+#if HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH)
+INSTANTIATE_TEST_SUITE_P(
+    LSX, Loop8Test6Param,
+    ::testing::Values(
+        make_tuple(&vpx_lpf_horizontal_4_lsx, &vpx_lpf_horizontal_4_c, 8),
+        make_tuple(&vpx_lpf_horizontal_8_lsx, &vpx_lpf_horizontal_8_c, 8),
+        make_tuple(&vpx_lpf_horizontal_16_dual_lsx,
+                   &vpx_lpf_horizontal_16_dual_c, 8),
+        make_tuple(&vpx_lpf_vertical_4_lsx, &vpx_lpf_vertical_4_c, 8),
+        make_tuple(&vpx_lpf_vertical_8_lsx, &vpx_lpf_vertical_8_c, 8),
+        make_tuple(&vpx_lpf_vertical_16_dual_lsx, &vpx_lpf_vertical_16_dual_c,
+                   8)));
+
+INSTANTIATE_TEST_SUITE_P(
+    LSX, Loop8Test9Param,
+    ::testing::Values(make_tuple(&vpx_lpf_horizontal_4_dual_lsx,
+                                 &vpx_lpf_horizontal_4_dual_c, 8),
+                      make_tuple(&vpx_lpf_horizontal_8_dual_lsx,
+                                 &vpx_lpf_horizontal_8_dual_c, 8),
+                      make_tuple(&vpx_lpf_vertical_4_dual_lsx,
+                                 &vpx_lpf_vertical_4_dual_c, 8),
+                      make_tuple(&vpx_lpf_vertical_8_dual_lsx,
+                                 &vpx_lpf_vertical_8_dual_c, 8)));
+#endif  // HAVE_LSX && (!CONFIG_VP9_HIGHBITDEPTH)
+
 }  // namespace
diff --git a/libvpx/test/partial_idct_test.cc b/libvpx/test/partial_idct_test.cc
index a160120..7eb888a 100644
--- a/libvpx/test/partial_idct_test.cc
+++ b/libvpx/test/partial_idct_test.cc
@@ -954,6 +954,20 @@
                          ::testing::ValuesIn(msa_partial_idct_tests));
 #endif  // HAVE_MSA && !CONFIG_VP9_HIGHBITDEPTH
 
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
+const PartialInvTxfmParam lsx_partial_idct_tests[] = {
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1024_add_c>,
+             &wrapper<vpx_idct32x32_1024_add_lsx>, TX_32X32, 1024, 8, 1),
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_34_add_c>,
+             &wrapper<vpx_idct32x32_34_add_lsx>, TX_32X32, 34, 8, 1),
+  make_tuple(&vpx_fdct32x32_c, &wrapper<vpx_idct32x32_1_add_c>,
+             &wrapper<vpx_idct32x32_1_add_lsx>, TX_32X32, 1, 8, 1),
+};
+
+INSTANTIATE_TEST_SUITE_P(LSX, PartialIDctTest,
+                         ::testing::ValuesIn(lsx_partial_idct_tests));
+#endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
+
 #endif  // !CONFIG_EMULATE_HARDWARE
 
 }  // namespace
diff --git a/libvpx/test/pp_filter_test.cc b/libvpx/test/pp_filter_test.cc
index a511ffb..775f7f3 100644
--- a/libvpx/test/pp_filter_test.cc
+++ b/libvpx/test/pp_filter_test.cc
@@ -115,7 +115,7 @@
   }
 
   vpx_free(flimits_);
-};
+}
 
 TEST_P(VpxPostProcDownAndAcrossMbRowTest, CheckCvsAssembly) {
   // Size of the underlying data block that will be filtered.
@@ -214,7 +214,7 @@
   PrintMedian("16x16");
 
   vpx_free(flimits_);
-};
+}
 
 class VpxMbPostProcAcrossIpTest
     : public AbstractBench,
diff --git a/libvpx/test/quantize_test.cc b/libvpx/test/quantize_test.cc
index 792b214..57309e8 100644
--- a/libvpx/test/quantize_test.cc
+++ b/libvpx/test/quantize_test.cc
@@ -224,4 +224,11 @@
         make_tuple(&vp8_fast_quantize_b_mmi, &vp8_fast_quantize_b_c),
         make_tuple(&vp8_regular_quantize_b_mmi, &vp8_regular_quantize_b_c)));
 #endif  // HAVE_MMI
+
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(
+    LSX, QuantizeTest,
+    ::testing::Values(make_tuple(&vp8_regular_quantize_b_lsx,
+                                 &vp8_regular_quantize_b_c)));
+#endif  // HAVE_LSX
 }  // namespace
diff --git a/libvpx/test/realtime_test.cc b/libvpx/test/realtime_test.cc
index b32a355..c5de2dc 100644
--- a/libvpx/test/realtime_test.cc
+++ b/libvpx/test/realtime_test.cc
@@ -35,17 +35,19 @@
   }
 
   void BeginPassHook(unsigned int /*pass*/) override {
+#if !CONFIG_REALTIME_ONLY
     // TODO(tomfinegan): We're changing the pass value here to make sure
     // we get frames when real time mode is combined with |g_pass| set to
     // VPX_RC_FIRST_PASS. This is necessary because EncoderTest::RunLoop() sets
     // the pass value based on the mode passed into EncoderTest::SetMode(),
     // which overrides the one specified in SetUp() above.
     cfg_.g_pass = VPX_RC_FIRST_PASS;
+#endif
   }
 
   void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                           ::libvpx_test::Encoder *encoder) override {
-    if (video->frame() == 0) {
+    if (video->frame() == 0 && set_cpu_used_) {
       encoder->Control(VP8E_SET_CPUUSED, 8);
     }
   }
@@ -70,31 +72,34 @@
     ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
   }
 
+  void TestEncode() {
+    ::libvpx_test::RandomVideoSource video;
+    video.SetSize(kVideoSourceWidth, kVideoSourceHeight);
+    video.set_limit(kFramesToEncode);
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+    EXPECT_EQ(kFramesToEncode, frame_packets_);
+  }
+
   int frame_packets_;
+  bool set_cpu_used_ = true;
 };
 
-TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) {
-  ::libvpx_test::RandomVideoSource video;
-  video.SetSize(kVideoSourceWidth, kVideoSourceHeight);
-  video.set_limit(kFramesToEncode);
-  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  EXPECT_EQ(kFramesToEncode, frame_packets_);
+TEST_P(RealtimeTest, RealtimeFirstPassProducesFrames) { TestEncode(); }
+
+TEST_P(RealtimeTest, RealtimeDefaultCpuUsed) {
+  set_cpu_used_ = false;
+  TestEncode();
 }
 
-TEST_P(RealtimeTest, IntegerOverflow) {
-  if (IsVP9()) {
-    // TODO(https://crbug.com/webm/1749): This should match VP8.
-    TestIntegerOverflow(800, 480);
-  } else {
-    TestIntegerOverflow(2048, 2048);
-  }
-}
+TEST_P(RealtimeTest, IntegerOverflow) { TestIntegerOverflow(2048, 2048); }
 
 TEST_P(RealtimeTest, IntegerOverflowLarge) {
   if (IsVP9()) {
-    GTEST_SKIP() << "TODO(https://crbug.com/webm/1750): Enable this test after "
-                    "undefined sanitizer warnings are fixed.";
-    // TestIntegerOverflow(16384, 16384);
+#if VPX_ARCH_X86_64
+    TestIntegerOverflow(16384, 16384);
+#else
+    TestIntegerOverflow(4096, 4096);
+#endif
   } else {
     GTEST_SKIP()
         << "TODO(https://crbug.com/webm/1748,https://crbug.com/webm/1751):"
diff --git a/libvpx/test/register_state_check.h b/libvpx/test/register_state_check.h
index 4366466..0b837dd 100644
--- a/libvpx/test/register_state_check.h
+++ b/libvpx/test/register_state_check.h
@@ -35,6 +35,7 @@
 #ifndef WIN32_LEAN_AND_MEAN
 #define WIN32_LEAN_AND_MEAN
 #endif
+#include <intrin.h>
 #include <windows.h>
 #include <winnt.h>
 
@@ -55,7 +56,7 @@
  private:
   static bool StoreRegisters(CONTEXT *const context) {
     const HANDLE this_thread = GetCurrentThread();
-    EXPECT_TRUE(this_thread != NULL);
+    EXPECT_NE(this_thread, nullptr);
     context->ContextFlags = CONTEXT_FLOATING_POINT;
     const bool context_saved = GetThreadContext(this_thread, context) == TRUE;
     EXPECT_TRUE(context_saved) << "GetLastError: " << GetLastError();
@@ -81,10 +82,13 @@
   CONTEXT pre_context_;
 };
 
-#define ASM_REGISTER_STATE_CHECK(statement)    \
-  do {                                         \
-    libvpx_test::RegisterStateCheck reg_check; \
-    statement;                                 \
+#define ASM_REGISTER_STATE_CHECK(statement)      \
+  do {                                           \
+    {                                            \
+      libvpx_test::RegisterStateCheck reg_check; \
+      statement;                                 \
+    }                                            \
+    _ReadWriteBarrier();                         \
   } while (false)
 
 }  // namespace libvpx_test
@@ -121,11 +125,22 @@
   int64_t pre_store_[8];
 };
 
+#if defined(__GNUC__)
+#define ASM_REGISTER_STATE_CHECK(statement)      \
+  do {                                           \
+    {                                            \
+      libvpx_test::RegisterStateCheck reg_check; \
+      statement;                                 \
+    }                                            \
+    __asm__ volatile("" ::: "memory");           \
+  } while (false)
+#else
 #define ASM_REGISTER_STATE_CHECK(statement)    \
   do {                                         \
     libvpx_test::RegisterStateCheck reg_check; \
     statement;                                 \
   } while (false)
+#endif
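
The inner scope added above makes the RegisterStateCheck destructor (where
the register comparison happens) run before the compiler barrier, and the
empty asm with a "memory" clobber then keeps the compiler from caching or
reordering memory accesses across the check. A minimal sketch of the idiom,
assuming GCC/Clang:

    #define COMPILER_BARRIER() __asm__ volatile("" ::: "memory")
    void demo(void) {
      int x = 0;
      {
        x = 1;  /* scoped work: any guard object destructs at this brace */
      }
      COMPILER_BARRIER();  /* accesses below cannot be hoisted above here */
      x = 2;
    }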
 
 }  // namespace libvpx_test
 
@@ -169,10 +184,13 @@
   uint16_t pre_fpu_env_[14];
 };
 
-#define API_REGISTER_STATE_CHECK(statement)       \
-  do {                                            \
-    libvpx_test::RegisterStateCheckMMX reg_check; \
-    ASM_REGISTER_STATE_CHECK(statement);          \
+#define API_REGISTER_STATE_CHECK(statement)         \
+  do {                                              \
+    {                                               \
+      libvpx_test::RegisterStateCheckMMX reg_check; \
+      ASM_REGISTER_STATE_CHECK(statement);          \
+    }                                               \
+    __asm__ volatile("" ::: "memory");              \
   } while (false)
 
 }  // namespace libvpx_test
diff --git a/libvpx/test/sad_test.cc b/libvpx/test/sad_test.cc
index ee10a46..2506f1a 100644
--- a/libvpx/test/sad_test.cc
+++ b/libvpx/test/sad_test.cc
@@ -56,8 +56,6 @@
                              const uint8_t *ref_ptr, int ref_stride,
                              unsigned int *sad_array);
 
-typedef TestParams<SadMxNx8Func> SadMxNx8Param;
-
 using libvpx_test::ACMRandom;
 
 namespace {
@@ -266,30 +264,6 @@
   ParamType params_;
 };
 
-class SADx8Test : public SADTestBase<SadMxNx8Param> {
- public:
-  SADx8Test() : SADTestBase(GetParam()) {}
-
- protected:
-  void SADs(unsigned int *results) const {
-    const uint8_t *reference = GetReferenceFromOffset(0);
-
-    ASM_REGISTER_STATE_CHECK(params_.func(
-        source_data_, source_stride_, reference, reference_stride_, results));
-  }
-
-  void CheckSADs() const {
-    uint32_t reference_sad;
-    DECLARE_ALIGNED(kDataAlignment, uint32_t, exp_sad[8]);
-
-    SADs(exp_sad);
-    for (int offset = 0; offset < 8; ++offset) {
-      reference_sad = ReferenceSAD(offset);
-      EXPECT_EQ(reference_sad, exp_sad[offset]) << "offset " << offset;
-    }
-  }
-};
-
 class SADx4Test : public SADTestBase<SadMxNx4Param> {
  public:
   SADx4Test() : SADTestBase(GetParam()) {}
@@ -564,13 +538,6 @@
   reference_stride_ = tmp_stride;
 }
 
-TEST_P(SADx8Test, Regular) {
-  FillRandomWH(source_data_, source_stride_, params_.width, params_.height);
-  FillRandomWH(GetReferenceFromOffset(0), reference_stride_, params_.width + 8,
-               params_.height);
-  CheckSADs();
-}
-
 //------------------------------------------------------------------------------
 // C functions
 const SadMxNParam c_tests[] = {
@@ -747,24 +714,6 @@
 };
 INSTANTIATE_TEST_SUITE_P(C, SADx4Test, ::testing::ValuesIn(x4d_c_tests));
 
-// TODO(angiebird): implement the marked-down sad functions
-const SadMxNx8Param x8_c_tests[] = {
-  // SadMxNx8Param(64, 64, &vpx_sad64x64x8_c),
-  // SadMxNx8Param(64, 32, &vpx_sad64x32x8_c),
-  // SadMxNx8Param(32, 64, &vpx_sad32x64x8_c),
-  SadMxNx8Param(32, 32, &vpx_sad32x32x8_c),
-  // SadMxNx8Param(32, 16, &vpx_sad32x16x8_c),
-  // SadMxNx8Param(16, 32, &vpx_sad16x32x8_c),
-  SadMxNx8Param(16, 16, &vpx_sad16x16x8_c),
-  SadMxNx8Param(16, 8, &vpx_sad16x8x8_c),
-  SadMxNx8Param(8, 16, &vpx_sad8x16x8_c),
-  SadMxNx8Param(8, 8, &vpx_sad8x8x8_c),
-  // SadMxNx8Param(8, 4, &vpx_sad8x4x8_c),
-  // SadMxNx8Param(4, 8, &vpx_sad4x8x8_c),
-  SadMxNx8Param(4, 4, &vpx_sad4x4x8_c),
-};
-INSTANTIATE_TEST_SUITE_P(C, SADx8Test, ::testing::ValuesIn(x8_c_tests));
-
 //------------------------------------------------------------------------------
 // ARM functions
 #if HAVE_NEON
@@ -992,18 +941,6 @@
 // Only functions are x3, which do not have tests.
 #endif  // HAVE_SSSE3
 
-#if HAVE_SSE4_1
-const SadMxNx8Param x8_sse4_1_tests[] = {
-  SadMxNx8Param(16, 16, &vpx_sad16x16x8_sse4_1),
-  SadMxNx8Param(16, 8, &vpx_sad16x8x8_sse4_1),
-  SadMxNx8Param(8, 16, &vpx_sad8x16x8_sse4_1),
-  SadMxNx8Param(8, 8, &vpx_sad8x8x8_sse4_1),
-  SadMxNx8Param(4, 4, &vpx_sad4x4x8_sse4_1),
-};
-INSTANTIATE_TEST_SUITE_P(SSE4_1, SADx8Test,
-                         ::testing::ValuesIn(x8_sse4_1_tests));
-#endif  // HAVE_SSE4_1
-
 #if HAVE_AVX2
 const SadMxNParam avx2_tests[] = {
   SadMxNParam(64, 64, &vpx_sad64x64_avx2),
@@ -1029,11 +966,6 @@
 };
 INSTANTIATE_TEST_SUITE_P(AVX2, SADx4Test, ::testing::ValuesIn(x4d_avx2_tests));
 
-const SadMxNx8Param x8_avx2_tests[] = {
-  // SadMxNx8Param(64, 64, &vpx_sad64x64x8_c),
-  SadMxNx8Param(32, 32, &vpx_sad32x32x8_avx2),
-};
-INSTANTIATE_TEST_SUITE_P(AVX2, SADx8Test, ::testing::ValuesIn(x8_avx2_tests));
 #endif  // HAVE_AVX2
 
 #if HAVE_AVX512
@@ -1196,4 +1128,33 @@
 };
 INSTANTIATE_TEST_SUITE_P(MMI, SADx4Test, ::testing::ValuesIn(x4d_mmi_tests));
 #endif  // HAVE_MMI
+
+//------------------------------------------------------------------------------
+// loongarch functions
+#if HAVE_LSX
+const SadMxNParam lsx_tests[] = {
+  SadMxNParam(64, 64, &vpx_sad64x64_lsx),
+  SadMxNParam(32, 32, &vpx_sad32x32_lsx),
+  SadMxNParam(16, 16, &vpx_sad16x16_lsx),
+  SadMxNParam(8, 8, &vpx_sad8x8_lsx),
+};
+INSTANTIATE_TEST_SUITE_P(LSX, SADTest, ::testing::ValuesIn(lsx_tests));
+
+const SadMxNAvgParam avg_lsx_tests[] = {
+  SadMxNAvgParam(64, 64, &vpx_sad64x64_avg_lsx),
+  SadMxNAvgParam(32, 32, &vpx_sad32x32_avg_lsx),
+};
+INSTANTIATE_TEST_SUITE_P(LSX, SADavgTest, ::testing::ValuesIn(avg_lsx_tests));
+
+const SadMxNx4Param x4d_lsx_tests[] = {
+  SadMxNx4Param(64, 64, &vpx_sad64x64x4d_lsx),
+  SadMxNx4Param(64, 32, &vpx_sad64x32x4d_lsx),
+  SadMxNx4Param(32, 64, &vpx_sad32x64x4d_lsx),
+  SadMxNx4Param(32, 32, &vpx_sad32x32x4d_lsx),
+  SadMxNx4Param(16, 16, &vpx_sad16x16x4d_lsx),
+  SadMxNx4Param(8, 8, &vpx_sad8x8x4d_lsx),
+};
+INSTANTIATE_TEST_SUITE_P(LSX, SADx4Test, ::testing::ValuesIn(x4d_lsx_tests));
+#endif  // HAVE_LSX
+
 }  // namespace
diff --git a/libvpx/test/set_roi.cc b/libvpx/test/set_roi.cc
index f639547..167cf90 100644
--- a/libvpx/test/set_roi.cc
+++ b/libvpx/test/set_roi.cc
@@ -161,6 +161,6 @@
   // Free allocated memory
   if (cpi.segmentation_map) vpx_free(cpi.segmentation_map);
   if (roi_map) vpx_free(roi_map);
-};
+}
 
 }  // namespace
diff --git a/libvpx/test/simple_encode_test.cc b/libvpx/test/simple_encode_test.cc
index 03e28e3..01fc258 100644
--- a/libvpx/test/simple_encode_test.cc
+++ b/libvpx/test/simple_encode_test.cc
@@ -37,13 +37,14 @@
   const int frame_rate_den_ = 1;
   const int target_bitrate_ = 1000;
   const int num_frames_ = 17;
+  const int target_level_ = LEVEL_UNKNOWN;
   const std::string in_file_path_str_ =
       libvpx_test::GetDataPath() + "/bus_352x288_420_f20_b8.yuv";
 };
 
 TEST_F(SimpleEncodeTest, ComputeFirstPassStats) {
   SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
-                             target_bitrate_, num_frames_,
+                             target_bitrate_, num_frames_, target_level_,
                              in_file_path_str_.c_str());
   simple_encode.ComputeFirstPassStats();
   std::vector<std::vector<double>> frame_stats =
@@ -64,7 +65,7 @@
 
 TEST_F(SimpleEncodeTest, ObserveFirstPassMotionVectors) {
   SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
-                             target_bitrate_, num_frames_,
+                             target_bitrate_, num_frames_, target_level_,
                              in_file_path_str_.c_str());
   simple_encode.ComputeFirstPassStats();
   std::vector<std::vector<MotionVectorInfo>> fps_motion_vectors =
@@ -86,7 +87,7 @@
 
 TEST_F(SimpleEncodeTest, GetCodingFrameNum) {
   SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
-                             target_bitrate_, num_frames_,
+                             target_bitrate_, num_frames_, target_level_,
                              in_file_path_str_.c_str());
   simple_encode.ComputeFirstPassStats();
   const int num_coding_frames = simple_encode.GetCodingFrameNum();
@@ -95,7 +96,7 @@
 
 TEST_F(SimpleEncodeTest, EncodeFrame) {
   SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
-                             target_bitrate_, num_frames_,
+                             target_bitrate_, num_frames_, target_level_,
                              in_file_path_str_.c_str());
   simple_encode.ComputeFirstPassStats();
   int num_coding_frames = simple_encode.GetCodingFrameNum();
@@ -138,7 +139,7 @@
 
 TEST_F(SimpleEncodeTest, ObserveKeyFrameMap) {
   SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
-                             target_bitrate_, num_frames_,
+                             target_bitrate_, num_frames_, target_level_,
                              in_file_path_str_.c_str());
   simple_encode.ComputeFirstPassStats();
   std::vector<int> key_frame_map = simple_encode.ObserveKeyFrameMap();
@@ -167,7 +168,7 @@
 
 TEST_F(SimpleEncodeTest, EncodeFrameWithTargetFrameBits) {
   SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
-                             target_bitrate_, num_frames_,
+                             target_bitrate_, num_frames_, target_level_,
                              in_file_path_str_.c_str());
   simple_encode.ComputeFirstPassStats();
   const int num_coding_frames = simple_encode.GetCodingFrameNum();
@@ -205,7 +206,7 @@
 
 TEST_F(SimpleEncodeTest, EncodeFrameWithQuantizeIndex) {
   SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
-                             target_bitrate_, num_frames_,
+                             target_bitrate_, num_frames_, target_level_,
                              in_file_path_str_.c_str());
   simple_encode.ComputeFirstPassStats();
   const int num_coding_frames = simple_encode.GetCodingFrameNum();
@@ -237,7 +238,7 @@
     // The first encode.
     SimpleEncode simple_encode(width_, height_, frame_rate_num_,
                                frame_rate_den_, target_bitrate_, num_frames_,
-                               in_file_path_str_.c_str());
+                               target_level_, in_file_path_str_.c_str());
     simple_encode.ComputeFirstPassStats();
     const int num_coding_frames = simple_encode.GetCodingFrameNum();
     simple_encode.StartEncode();
@@ -257,7 +258,7 @@
     // The second encode with quantize index got from the first encode.
     SimpleEncode simple_encode(width_, height_, frame_rate_num_,
                                frame_rate_den_, target_bitrate_, num_frames_,
-                               in_file_path_str_.c_str());
+                               target_level_, in_file_path_str_.c_str());
     simple_encode.ComputeFirstPassStats();
     const int num_coding_frames = simple_encode.GetCodingFrameNum();
     EXPECT_EQ(static_cast<size_t>(num_coding_frames),
@@ -286,7 +287,7 @@
   const int num_units_4x4 = num_rows_4x4 * num_cols_4x4;
   // The first encode.
   SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
-                             target_bitrate_, num_frames_,
+                             target_bitrate_, num_frames_, target_level_,
                              in_file_path_str_.c_str());
   simple_encode.ComputeFirstPassStats();
   const int num_coding_frames = simple_encode.GetCodingFrameNum();
@@ -309,7 +310,7 @@
   // The second encode.
   SimpleEncode simple_encode_2(width_, height_, frame_rate_num_,
                                frame_rate_den_, target_bitrate_, num_frames_,
-                               in_file_path_str_.c_str());
+                               target_level_, in_file_path_str_.c_str());
   simple_encode_2.ComputeFirstPassStats();
   const int num_coding_frames_2 = simple_encode_2.GetCodingFrameNum();
   simple_encode_2.StartEncode();
@@ -357,7 +358,7 @@
   const int num_units_4x4 = num_rows_4x4 * num_cols_4x4;
   // The first encode.
   SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
-                             target_bitrate_, num_frames_,
+                             target_bitrate_, num_frames_, target_level_,
                              in_file_path_str_.c_str());
   simple_encode.ComputeFirstPassStats();
   const int num_coding_frames = simple_encode.GetCodingFrameNum();
@@ -377,7 +378,7 @@
   // The second encode.
   SimpleEncode simple_encode_2(width_, height_, frame_rate_num_,
                                frame_rate_den_, target_bitrate_, num_frames_,
-                               in_file_path_str_.c_str());
+                               target_level_, in_file_path_str_.c_str());
   simple_encode_2.ComputeFirstPassStats();
   const int num_coding_frames_2 = simple_encode_2.GetCodingFrameNum();
   simple_encode_2.StartEncode();
@@ -417,7 +418,7 @@
     // The first encode.
     SimpleEncode simple_encode(width_, height_, frame_rate_num_,
                                frame_rate_den_, target_bitrate_, num_frames_,
-                               in_file_path_str_.c_str());
+                               target_level_, in_file_path_str_.c_str());
     simple_encode.ComputeFirstPassStats();
     simple_encode.StartEncode();
 
@@ -449,7 +450,7 @@
     // The external arfs are the same as the first encode.
     SimpleEncode simple_encode(width_, height_, frame_rate_num_,
                                frame_rate_den_, target_bitrate_, num_frames_,
-                               in_file_path_str_.c_str());
+                               target_level_, in_file_path_str_.c_str());
     simple_encode.ComputeFirstPassStats();
     simple_encode.SetExternalGroupOfPicturesMap(gop_map.data(), gop_map.size());
     const int num_coding_frames = simple_encode.GetCodingFrameNum();
@@ -471,7 +472,7 @@
 
 TEST_F(SimpleEncodeTest, SetExternalGroupOfPicturesMap) {
   SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
-                             target_bitrate_, num_frames_,
+                             target_bitrate_, num_frames_, target_level_,
                              in_file_path_str_.c_str());
   simple_encode.ComputeFirstPassStats();
 
@@ -541,7 +542,7 @@
   // Makes sure that the encode_frame_info obtained from GetEncodeFrameInfo()
   // matches the counterpart in encode_frame_result obtained from EncodeFrame()
   SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
-                             target_bitrate_, num_frames_,
+                             target_bitrate_, num_frames_, target_level_,
                              in_file_path_str_.c_str());
   simple_encode.ComputeFirstPassStats();
   const int num_coding_frames = simple_encode.GetCodingFrameNum();
@@ -558,7 +559,7 @@
 
 TEST_F(SimpleEncodeTest, GetFramePixelCount) {
   SimpleEncode simple_encode(width_, height_, frame_rate_num_, frame_rate_den_,
-                             target_bitrate_, num_frames_,
+                             target_bitrate_, num_frames_, target_level_,
                              in_file_path_str_.c_str());
   EXPECT_EQ(simple_encode.GetFramePixelCount(),
             static_cast<uint64_t>(width_ * height_ * 3 / 2));
diff --git a/libvpx/test/svc_datarate_test.cc b/libvpx/test/svc_datarate_test.cc
index 95d82ce..291cb01 100644
--- a/libvpx/test/svc_datarate_test.cc
+++ b/libvpx/test/svc_datarate_test.cc
@@ -1354,7 +1354,6 @@
   ResetModel();
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
-  // TODO(jianj): webm:1554
   CheckLayerRateTargeting(number_spatial_layers_, number_temporal_layers_, 0.70,
                           1.15);
 #if CONFIG_VP9_DECODER
diff --git a/libvpx/test/svc_end_to_end_test.cc b/libvpx/test/svc_end_to_end_test.cc
index 518824d..7300ce6 100644
--- a/libvpx/test/svc_end_to_end_test.cc
+++ b/libvpx/test/svc_end_to_end_test.cc
@@ -15,6 +15,7 @@
 #include "test/svc_test.h"
 #include "test/util.h"
 #include "test/y4m_video_source.h"
+#include "vp9/common/vp9_onyxc_int.h"
 #include "vpx/vpx_codec.h"
 #include "vpx_ports/bitops.h"
 
@@ -139,6 +140,91 @@
     return current_video_frame_ >= frame_to_start_decode_;
   }
 
+  // Example pattern for any number of spatial layers with 2 temporal layers,
+  // used in the bypass/flexible mode. The pattern corresponds to the pattern
+  // VP9E_TEMPORAL_LAYERING_MODE_0101 (temporal_layering_mode == 2) used in
+  // non-flexible mode.
+  void set_frame_flags_bypass_mode(
+      int tl, int num_spatial_layers, int is_key_frame,
+      vpx_svc_ref_frame_config_t *ref_frame_config) {
+    int sl;
+    for (sl = 0; sl < num_spatial_layers; ++sl)
+      ref_frame_config->update_buffer_slot[sl] = 0;
+
+    for (sl = 0; sl < num_spatial_layers; ++sl) {
+      // Set the buffer idx.
+      if (tl == 0) {
+        ref_frame_config->lst_fb_idx[sl] = sl;
+        if (sl) {
+          if (is_key_frame) {
+            ref_frame_config->lst_fb_idx[sl] = sl - 1;
+            ref_frame_config->gld_fb_idx[sl] = sl;
+          } else {
+            ref_frame_config->gld_fb_idx[sl] = sl - 1;
+          }
+        } else {
+          ref_frame_config->gld_fb_idx[sl] = 0;
+        }
+        ref_frame_config->alt_fb_idx[sl] = 0;
+      } else if (tl == 1) {
+        ref_frame_config->lst_fb_idx[sl] = sl;
+        ref_frame_config->gld_fb_idx[sl] =
+            (sl == 0) ? 0 : num_spatial_layers + sl - 1;
+        ref_frame_config->alt_fb_idx[sl] = num_spatial_layers + sl;
+      }
+      // Set the reference and update flags.
+      if (!tl) {
+        if (!sl) {
+          // Base spatial and base temporal (sl = 0, tl = 0)
+          ref_frame_config->reference_last[sl] = 1;
+          ref_frame_config->reference_golden[sl] = 0;
+          ref_frame_config->reference_alt_ref[sl] = 0;
+          ref_frame_config->update_buffer_slot[sl] |=
+              1 << ref_frame_config->lst_fb_idx[sl];
+        } else {
+          if (is_key_frame) {
+            ref_frame_config->reference_last[sl] = 1;
+            ref_frame_config->reference_golden[sl] = 0;
+            ref_frame_config->reference_alt_ref[sl] = 0;
+            ref_frame_config->update_buffer_slot[sl] |=
+                1 << ref_frame_config->gld_fb_idx[sl];
+          } else {
+            // Non-zero spatial layer.
+            ref_frame_config->reference_last[sl] = 1;
+            ref_frame_config->reference_golden[sl] = 1;
+            ref_frame_config->reference_alt_ref[sl] = 1;
+            ref_frame_config->update_buffer_slot[sl] |=
+                1 << ref_frame_config->lst_fb_idx[sl];
+          }
+        }
+      } else if (tl == 1) {
+        if (!sl) {
+          // Base spatial and top temporal (tl = 1)
+          ref_frame_config->reference_last[sl] = 1;
+          ref_frame_config->reference_golden[sl] = 0;
+          ref_frame_config->reference_alt_ref[sl] = 0;
+          ref_frame_config->update_buffer_slot[sl] |=
+              1 << ref_frame_config->alt_fb_idx[sl];
+        } else {
+          // Non-zero spatial layer.
+          if (sl < num_spatial_layers - 1) {
+            ref_frame_config->reference_last[sl] = 1;
+            ref_frame_config->reference_golden[sl] = 1;
+            ref_frame_config->reference_alt_ref[sl] = 0;
+            ref_frame_config->update_buffer_slot[sl] |=
+                1 << ref_frame_config->alt_fb_idx[sl];
+          } else if (sl == num_spatial_layers - 1) {
+            // Top spatial and top temporal (non-reference -- doesn't
+            // update any reference buffers).
+            ref_frame_config->reference_last[sl] = 1;
+            ref_frame_config->reference_golden[sl] = 1;
+            ref_frame_config->reference_alt_ref[sl] = 0;
+          }
+        }
+      }
+    }
+  }
+
   virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
                                   ::libvpx_test::Encoder *encoder) {
     current_video_frame_ = video->frame();
@@ -158,6 +244,21 @@
 
       encoder->Control(VP9E_SET_DISABLE_LOOPFILTER, loopfilter_off_);
     }
+    if (flexible_mode_) {
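+      // Alternate temporal_layer_id 0,1,0,1,... across frames, mirroring the
+      // 0101 pattern of temporal_layering_mode == 2, and use the same id for
+      // every spatial layer.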
+      vpx_svc_layer_id_t layer_id;
+      layer_id.spatial_layer_id = 0;
+      layer_id.temporal_layer_id = (video->frame() % 2 != 0);
+      temporal_layer_id_ = layer_id.temporal_layer_id;
+      for (int i = 0; i < number_spatial_layers_; i++) {
+        layer_id.temporal_layer_id_per_spatial[i] = temporal_layer_id_;
+        ref_frame_config_.duration[i] = 1;
+      }
+      encoder->Control(VP9E_SET_SVC_LAYER_ID, &layer_id);
+      set_frame_flags_bypass_mode(layer_id.temporal_layer_id,
+                                  number_spatial_layers_, 0,
+                                  &ref_frame_config_);
+      encoder->Control(VP9E_SET_SVC_REF_FRAME_CONFIG, &ref_frame_config_);
+    }
     if (video->frame() == frame_to_sync_) {
       encoder->Control(VP9E_SET_SVC_SPATIAL_LAYER_SYNC, &svc_layer_sync_);
     }
@@ -171,9 +272,14 @@
         decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER,
                          decode_to_layer_before_sync_);
     } else {
-      if (decode_to_layer_after_sync_ >= 0)
-        decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER,
-                         decode_to_layer_after_sync_);
+      if (decode_to_layer_after_sync_ >= 0) {
+        int decode_to_layer = decode_to_layer_after_sync_;
+        // The overlay frame is an additional layer for the intra-only frame.
+        if (video->frame() == frame_to_sync_ && intra_only_test_ &&
+            decode_to_layer_after_sync_ == 0 && number_spatial_layers_ > 1)
+          decode_to_layer += 1;
+        decoder->Control(VP9_DECODE_SVC_SPATIAL_LAYER, decode_to_layer);
+      }
     }
   }
 #endif
@@ -221,6 +327,8 @@
   vpx_svc_spatial_layer_sync_t svc_layer_sync_;
   unsigned int mismatch_nframes_;
   unsigned int num_nonref_frames_;
+  bool flexible_mode_;
+  vpx_svc_ref_frame_config_t ref_frame_config_;
 
  private:
   virtual void SetConfig(const int num_temporal_layer) {
@@ -246,7 +354,7 @@
       cfg_.temporal_layering_mode = 2;
     } else if (num_temporal_layer == 1) {
       cfg_.ts_rate_decimator[0] = 1;
-      cfg_.temporal_layering_mode = 1;
+      cfg_.temporal_layering_mode = 0;
     }
   }
 };
@@ -270,6 +378,7 @@
   ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
 
   cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = false;
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 #if CONFIG_VP9_DECODER
@@ -297,6 +406,7 @@
   ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
                                        0, 400);
   cfg_.rc_target_bitrate = 400;
+  flexible_mode_ = false;
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 #if CONFIG_VP9_DECODER
@@ -324,6 +434,7 @@
 
   ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
   cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = false;
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 #if CONFIG_VP9_DECODER
@@ -351,6 +462,7 @@
 
   ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
   cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = false;
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 #if CONFIG_VP9_DECODER
@@ -380,6 +492,7 @@
   ::libvpx_test::I420VideoSource video("niklas_640_480_30.yuv", 640, 480, 30, 1,
                                        0, 400);
   cfg_.rc_target_bitrate = 400;
+  flexible_mode_ = false;
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 #if CONFIG_VP9_DECODER
@@ -390,6 +503,61 @@
 }
 #endif
 
+// Encode 3 spatial, 2 temporal layers in flexible mode but don't start
+// decoding. During the sequence insert an intra-only frame on the base/QVGA
+// layer at frame 20 and decode only the QVGA layer from there.
+TEST_P(SyncFrameOnePassCbrSvc,
+       OnePassCbrSvc3SL3TLSyncFrameStartDecodeOnIntraOnlyQVGAFlex) {
+  SetSvcConfig(3, 2);
+  frame_to_start_decode_ = 20;
+  frame_to_sync_ = 20;
+  decode_to_layer_before_sync_ = 2;
+  decode_to_layer_after_sync_ = 0;
+  intra_only_test_ = true;
+
+  // Set up svc layer sync structure.
+  svc_layer_sync_.base_layer_intra_only = 1;
+  svc_layer_sync_.spatial_layer_sync[0] = 1;
+  svc_layer_sync_.spatial_layer_sync[1] = 0;
+  svc_layer_sync_.spatial_layer_sync[2] = 0;
+
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = true;
+  AssignLayerBitrates();
+  cfg_.temporal_layering_mode = VP9E_TEMPORAL_LAYERING_MODE_BYPASS;
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Can't check mismatch here because only base is decoded at
+  // frame sync, whereas encoder continues encoding all layers.
+}
+
+// Encode 3 spatial, 3 temporal layers but don't start decoding.
+// During the sequence insert an intra-only frame on the base/QVGA layer at
+// frame 20 and decode only the QVGA layer from there.
+TEST_P(SyncFrameOnePassCbrSvc,
+       OnePassCbrSvc3SL3TLSyncFrameStartDecodeOnIntraOnlyQVGA) {
+  SetSvcConfig(3, 3);
+  frame_to_start_decode_ = 20;
+  frame_to_sync_ = 20;
+  decode_to_layer_before_sync_ = 2;
+  decode_to_layer_after_sync_ = 0;
+  intra_only_test_ = true;
+
+  // Set up svc layer sync structure.
+  svc_layer_sync_.base_layer_intra_only = 1;
+  svc_layer_sync_.spatial_layer_sync[0] = 1;
+  svc_layer_sync_.spatial_layer_sync[1] = 0;
+  svc_layer_sync_.spatial_layer_sync[2] = 0;
+
+  ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
+  cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = false;
+  AssignLayerBitrates();
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  // Can't check mismatch here because only base is decoded at
+  // frame sync, whereas encoder continues encoding all layers.
+}
+
 // Start decoding from beginning of sequence, during sequence insert intra-only
 // on base/qvga layer. Decode all layers.
 TEST_P(SyncFrameOnePassCbrSvc, OnePassCbrSvc3SL3TLSyncFrameIntraOnlyQVGA) {
@@ -397,8 +565,9 @@
   frame_to_start_decode_ = 0;
   frame_to_sync_ = 20;
   decode_to_layer_before_sync_ = 2;
-  // The superframe containing intra-only layer will have 4 frames. Thus set the
-  // layer to decode after sync frame to 3.
+  // The superframe containing the intra-only layer has one extra frame, so
+  // set the layer to decode after the sync frame to
+  // decode_to_layer_before_sync_ + 1.
   decode_to_layer_after_sync_ = 3;
   intra_only_test_ = true;
 
@@ -410,6 +579,7 @@
 
   ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
   cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = false;
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 #if CONFIG_VP9_DECODER
@@ -426,8 +596,9 @@
   frame_to_start_decode_ = 0;
   frame_to_sync_ = 20;
   decode_to_layer_before_sync_ = 2;
-  // The superframe containing intra-only layer will have 4 frames. Thus set the
-  // layer to decode after sync frame to 3.
+  // The superframe containing the intra-only layer has one extra frame, so
+  // set the layer to decode after the sync frame to
+  // decode_to_layer_before_sync_ + 1.
   decode_to_layer_after_sync_ = 3;
   intra_only_test_ = true;
 
@@ -439,6 +610,7 @@
 
   ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
   cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = false;
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 #if CONFIG_VP9_DECODER
@@ -464,6 +636,7 @@
 
   ::libvpx_test::Y4mVideoSource video("niklas_1280_720_30.y4m", 0, 60);
   cfg_.rc_target_bitrate = 600;
+  flexible_mode_ = false;
   AssignLayerBitrates();
   ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
 #if CONFIG_VP9_DECODER
diff --git a/libvpx/test/test-data.mk b/libvpx/test/test-data.mk
index 46fe359..62a9d6e 100644
--- a/libvpx/test/test-data.mk
+++ b/libvpx/test/test-data.mk
@@ -6,6 +6,7 @@
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += hantro_odd.yuv
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += desktop_office1.1280_720-020.yuv
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += slides_code_term_web_plot.1920_1080.yuv
+LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += desktopqvga.320_240.yuv
 
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_420_20f.y4m
 LIBVPX_TEST_DATA-$(CONFIG_ENCODERS) += park_joy_90p_10_422_20f.y4m
diff --git a/libvpx/test/test-data.sha1 b/libvpx/test/test-data.sha1
index 668992f..55f92a2 100644
--- a/libvpx/test/test-data.sha1
+++ b/libvpx/test/test-data.sha1
@@ -869,3 +869,4 @@
 518a0be998afece76d3df76047d51e256c591ff2 *invalid-bug-148271109.ivf
 d3964f9dad9f60363c81b688324d95b4ec7c8038 *invalid-bug-148271109.ivf.res
 ad18ca16f0a249fb3b7c38de0d9b327fed273f96 *hantro_collage_w352h288_nv12.yuv
+8a0b2c350539859463d3546a67876c83ff6ff0ac *desktopqvga.320_240.yuv
diff --git a/libvpx/test/test.mk b/libvpx/test/test.mk
index 11228ec..6df4572 100644
--- a/libvpx/test/test.mk
+++ b/libvpx/test/test.mk
@@ -156,6 +156,7 @@
 LIBVPX_TEST_SRCS-yes                   += tile_independence_test.cc
 LIBVPX_TEST_SRCS-yes                   += vp9_boolcoder_test.cc
 LIBVPX_TEST_SRCS-yes                   += vp9_encoder_parms_get_to_decoder.cc
+LIBVPX_TEST_SRCS-yes                   += vp9_roi_test.cc
 endif
 
 LIBVPX_TEST_SRCS-yes                   += convolve_test.cc
@@ -213,9 +214,11 @@
 TEST_INTRA_PRED_SPEED_SRCS-yes := test_intra_pred_speed.cc
 TEST_INTRA_PRED_SPEED_SRCS-yes += ../md5_utils.h ../md5_utils.c
 
-RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) := ratectrl_rtc_test.cc
-RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) += encode_test_driver.cc
-RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) += encode_test_driver.h
+RC_INTERFACE_TEST_SRCS-yes := test_rc_interface.cc
+RC_INTERFACE_TEST_SRCS-$(CONFIG_VP9_ENCODER) += vp9_ratectrl_rtc_test.cc
+RC_INTERFACE_TEST_SRCS-$(CONFIG_VP8_ENCODER) += vp8_ratectrl_rtc_test.cc
+RC_INTERFACE_TEST_SRCS-$(CONFIG_ENCODERS) += encode_test_driver.cc
+RC_INTERFACE_TEST_SRCS-$(CONFIG_ENCODERS) += encode_test_driver.h
 RC_INTERFACE_TEST_SRCS-yes += decode_test_driver.cc
 RC_INTERFACE_TEST_SRCS-yes += decode_test_driver.h
 RC_INTERFACE_TEST_SRCS-yes += codec_factory.h
diff --git a/libvpx/test/test_intra_pred_speed.cc b/libvpx/test/test_intra_pred_speed.cc
index 08100a1..28b3484 100644
--- a/libvpx/test/test_intra_pred_speed.cc
+++ b/libvpx/test/test_intra_pred_speed.cc
@@ -48,11 +48,9 @@
     for (int i = 0; i < kBPS; ++i) left[i] = rnd.Rand16() & mask;
     for (int i = -1; i < kBPS; ++i) above[i] = rnd.Rand16() & mask;
 
-    // some code assumes the top row has been extended:
-    // d45/d63 C-code, for instance, but not the assembly.
-    // TODO(jzern): this style of extension isn't strictly necessary.
+    // d45/d63 require the top row to be extended.
     ASSERT_LE(block_size, kBPS);
-    for (int i = block_size; i < 2 * kBPS; ++i) {
+    for (int i = block_size; i < 2 * block_size; ++i) {
       above[i] = above[block_size - 1];
     }
   }
diff --git a/libvpx/test/test_rc_interface.cc b/libvpx/test/test_rc_interface.cc
new file mode 100644
index 0000000..ec75700
--- /dev/null
+++ b/libvpx/test/test_rc_interface.cc
@@ -0,0 +1,6 @@
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
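+// Shared gtest entry point for the rate-control interface tests
+// (vp8_ratectrl_rtc_test.cc and vp9_ratectrl_rtc_test.cc).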
+int main(int argc, char **argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+  return RUN_ALL_TESTS();
+}
diff --git a/libvpx/test/variance_test.cc b/libvpx/test/variance_test.cc
index 1b76b20..8085505 100644
--- a/libvpx/test/variance_test.cc
+++ b/libvpx/test/variance_test.cc
@@ -596,6 +596,7 @@
  protected:
   void RefTest();
   void ExtremeRefTest();
+  void SpeedTest();
 
   ACMRandom rnd_;
   uint8_t *src_;
@@ -681,6 +682,37 @@
   }
 }
 
+template <typename SubpelVarianceFunctionType>
+void SubpelVarianceTest<SubpelVarianceFunctionType>::SpeedTest() {
+  // The only interesting offsets are 0 (full pel), 4 (half pel), and
+  // anything else. To keep the loops simple we use 0, 2 and 4.
+  for (int x = 0; x <= 4; x += 2) {
+    for (int y = 0; y <= 4; y += 2) {
+      if (!use_high_bit_depth()) {
+        memset(src_, 25, block_size());
+        memset(ref_, 50, block_size());
+#if CONFIG_VP9_HIGHBITDEPTH
+      } else {
+        vpx_memset16(CONVERT_TO_SHORTPTR(src_), 25, block_size());
+        vpx_memset16(CONVERT_TO_SHORTPTR(ref_), 50, block_size());
+#endif  // CONFIG_VP9_HIGHBITDEPTH
+      }
+      unsigned int sse;
+      vpx_usec_timer timer;
+      vpx_usec_timer_start(&timer);
+      for (int i = 0; i < 1000000000 / block_size(); ++i) {
+        const uint32_t variance =
+            params_.func(ref_, width() + 1, x, y, src_, width(), &sse);
+        (void)variance;
+      }
+      vpx_usec_timer_mark(&timer);
+      const int elapsed_time = static_cast<int>(vpx_usec_timer_elapsed(&timer));
+      printf("SubpelVariance %dx%d xoffset: %d yoffset: %d time: %5d ms\n",
+             width(), height(), x, y, elapsed_time / 1000);
+    }
+  }
+}
+
 template <>
 void SubpelVarianceTest<vpx_subp_avg_variance_fn_t>::RefTest() {
   for (int x = 0; x < 8; ++x) {
@@ -736,6 +768,7 @@
 TEST_P(SumOfSquaresTest, Ref) { RefTest(); }
 TEST_P(VpxSubpelVarianceTest, Ref) { RefTest(); }
 TEST_P(VpxSubpelVarianceTest, ExtremeRef) { ExtremeRefTest(); }
+TEST_P(VpxSubpelVarianceTest, DISABLED_Speed) { SpeedTest(); }
 TEST_P(VpxSubpelAvgVarianceTest, Ref) { RefTest(); }
 
 INSTANTIATE_TEST_SUITE_P(C, SumOfSquaresTest,
@@ -1616,4 +1649,27 @@
         SubpelAvgVarianceParams(2, 3, &vpx_sub_pixel_avg_variance4x8_mmi, 0),
         SubpelAvgVarianceParams(2, 2, &vpx_sub_pixel_avg_variance4x4_mmi, 0)));
 #endif  // HAVE_MMI
+
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(LSX, VpxMseTest,
+                         ::testing::Values(MseParams(4, 4, &vpx_mse16x16_lsx)));
+
+INSTANTIATE_TEST_SUITE_P(
+    LSX, VpxVarianceTest,
+    ::testing::Values(VarianceParams(6, 6, &vpx_variance64x64_lsx),
+                      VarianceParams(5, 5, &vpx_variance32x32_lsx),
+                      VarianceParams(4, 4, &vpx_variance16x16_lsx),
+                      VarianceParams(3, 3, &vpx_variance8x8_lsx)));
+
+INSTANTIATE_TEST_SUITE_P(
+    LSX, VpxSubpelVarianceTest,
+    ::testing::Values(
+        SubpelVarianceParams(3, 3, &vpx_sub_pixel_variance8x8_lsx, 0),
+        SubpelVarianceParams(4, 4, &vpx_sub_pixel_variance16x16_lsx, 0),
+        SubpelVarianceParams(5, 5, &vpx_sub_pixel_variance32x32_lsx, 0)));
+
+INSTANTIATE_TEST_SUITE_P(LSX, VpxSubpelAvgVarianceTest,
+                         ::testing::Values(SubpelAvgVarianceParams(
+                             6, 6, &vpx_sub_pixel_avg_variance64x64_lsx, 0)));
+#endif
 }  // namespace
diff --git a/libvpx/test/video_source.h b/libvpx/test/video_source.h
index e9340f2..a10ff6f 100644
--- a/libvpx/test/video_source.h
+++ b/libvpx/test/video_source.h
@@ -20,8 +20,14 @@
 #endif
 #include <cstdio>
 #include <cstdlib>
+#include <cstring>
+#include <memory>
 #include <string>
+
 #include "test/acm_random.h"
+#if !defined(_WIN32)
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#endif
 #include "vpx/vpx_encoder.h"
 
 namespace libvpx_test {
@@ -36,7 +42,7 @@
 // A simple function to encapsulate cross platform retrieval of test data path
 static std::string GetDataPath() {
   const char *const data_path = getenv("LIBVPX_TEST_DATA_PATH");
-  if (data_path == NULL) {
+  if (data_path == nullptr) {
 #ifdef LIBVPX_TEST_DATA_PATH
     // In some environments, we cannot set environment variables
     // Instead, we set the data path by using a preprocessor symbol
@@ -70,9 +76,25 @@
       return fopen(fname, "wb+");
     }
   }
-  return NULL;
+  return nullptr;
 #else
-  return tmpfile();
+  std::string temp_dir = testing::TempDir();
+  if (temp_dir.empty()) return nullptr;
+  // Versions of testing::TempDir() prior to release-1.11.0-214-g5e6a5336 may
+  // use the value of an environment variable without checking for a trailing
+  // path delimiter.
+  if (temp_dir[temp_dir.size() - 1] != '/') temp_dir += '/';
+  const char name_template[] = "libvpxtest.XXXXXX";
+  std::unique_ptr<char[]> temp_file_name(
+      new char[temp_dir.size() + sizeof(name_template)]);
+  if (temp_file_name == nullptr) return nullptr;
+  memcpy(temp_file_name.get(), temp_dir.data(), temp_dir.size());
+  memcpy(temp_file_name.get() + temp_dir.size(), name_template,
+         sizeof(name_template));
+  const int fd = mkstemp(temp_file_name.get());
+  if (fd == -1) return nullptr;
+  *file_name = temp_file_name.get();
+  return fdopen(fd, "wb+");
 #endif
 }
 
@@ -92,7 +114,7 @@
   void CloseFile() {
     if (file_) {
       fclose(file_);
-      file_ = NULL;
+      file_ = nullptr;
     }
   }
   FILE *file_;
@@ -111,7 +133,7 @@
   // Advance the cursor to the next frame
   virtual void Next() = 0;
 
-  // Get the current video frame, or NULL on End-Of-Stream.
+  // Get the current video frame, or nullptr on End-Of-Stream.
   virtual vpx_image_t *img() const = 0;
 
   // Get the presentation timestamp of the current frame.
@@ -133,7 +155,7 @@
 class DummyVideoSource : public VideoSource {
  public:
   DummyVideoSource()
-      : img_(NULL), limit_(100), width_(80), height_(64),
+      : img_(nullptr), limit_(100), width_(80), height_(64),
         format_(VPX_IMG_FMT_I420) {
     ReallocImage();
   }
@@ -150,7 +172,9 @@
     FillFrame();
   }
 
-  virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; }
+  virtual vpx_image_t *img() const {
+    return (frame_ < limit_) ? img_ : nullptr;
+  }
 
   // Models a stream where Timebase = 1/FPS, so pts == frame.
   virtual vpx_codec_pts_t pts() const { return frame_; }
@@ -190,8 +214,9 @@
 
   void ReallocImage() {
     vpx_img_free(img_);
-    img_ = vpx_img_alloc(NULL, format_, width_, height_, 32);
-    raw_sz_ = ((img_->w + 31) & ~31) * img_->h * img_->bps / 8;
+    img_ = vpx_img_alloc(nullptr, format_, width_, height_, 32);
+    ASSERT_NE(img_, nullptr);
+    raw_sz_ = ((img_->w + 31) & ~31u) * img_->h * img_->bps / 8;
   }
 
   vpx_image_t *img_;
diff --git a/libvpx/test/vp8_fdct4x4_test.cc b/libvpx/test/vp8_fdct4x4_test.cc
index d5ac253..1b73a72 100644
--- a/libvpx/test/vp8_fdct4x4_test.cc
+++ b/libvpx/test/vp8_fdct4x4_test.cc
@@ -148,7 +148,7 @@
 
   EXPECT_EQ(true, bias_acceptable)
       << "Error: 4x4 FDCT has a sign bias > 10% for input range [-15, 15]";
-};
+}
 
 TEST_P(FdctTest, RoundTripErrorCheck) {
   int max_error = 0;
@@ -181,7 +181,7 @@
 
   EXPECT_GE(count_test_block, total_error)
       << "Error: FDCT/IDCT has average roundtrip error > 1 per block";
-};
+}
 
 INSTANTIATE_TEST_SUITE_P(C, FdctTest, ::testing::Values(vp8_short_fdct4x4_c));
 
@@ -203,4 +203,9 @@
 INSTANTIATE_TEST_SUITE_P(MMI, FdctTest,
                          ::testing::Values(vp8_short_fdct4x4_mmi));
 #endif  // HAVE_MMI
+
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(LSX, FdctTest,
+                         ::testing::Values(vp8_short_fdct4x4_lsx));
+#endif  // HAVE_LSX
 }  // namespace
diff --git a/libvpx/test/vp8_ratectrl_rtc_test.cc b/libvpx/test/vp8_ratectrl_rtc_test.cc
new file mode 100644
index 0000000..ad31066
--- /dev/null
+++ b/libvpx/test/vp8_ratectrl_rtc_test.cc
@@ -0,0 +1,343 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <fstream>  // NOLINT
+#include <string>
+
+#include "./vpx_config.h"
+#include "third_party/googletest/src/include/gtest/gtest.h"
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/video_source.h"
+#include "vp8/vp8_ratectrl_rtc.h"
+#include "vpx/vpx_codec.h"
+#include "vpx_ports/bitops.h"
+
+namespace {
+
+struct Vp8RCTestVideo {
+  Vp8RCTestVideo() {}
+  Vp8RCTestVideo(const char *name_, int width_, int height_,
+                 unsigned int frames_)
+      : name(name_), width(width_), height(height_), frames(frames_) {}
+
+  friend std::ostream &operator<<(std::ostream &os,
+                                  const Vp8RCTestVideo &video) {
+    os << video.name << " " << video.width << " " << video.height << " "
+       << video.frames;
+    return os;
+  }
+  const char *name;
+  int width;
+  int height;
+  unsigned int frames;
+};
+
+const Vp8RCTestVideo kVp8RCTestVectors[] = {
+  Vp8RCTestVideo("niklas_640_480_30.yuv", 640, 480, 470),
+  Vp8RCTestVideo("desktop_office1.1280_720-020.yuv", 1280, 720, 300),
+};
+
+class Vp8RcInterfaceTest
+    : public ::libvpx_test::EncoderTest,
+      public ::libvpx_test::CodecTestWith2Params<int, Vp8RCTestVideo> {
+ public:
+  Vp8RcInterfaceTest()
+      : EncoderTest(GET_PARAM(0)), key_interval_(3000), encoder_exit_(false) {}
+  virtual ~Vp8RcInterfaceTest() {}
+
+ protected:
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+  }
+
+  // From error_resilience_test.cc
+  int SetFrameFlags(int frame_num, int num_temp_layers) {
+    int frame_flags = 0;
+    if (num_temp_layers == 2) {
+      if (frame_num % 2 == 0) {
+        // Layer 0: predict from L and ARF, update L.
+        frame_flags =
+            VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF;
+      } else {
+        // Layer 1: predict from L, G and ARF, and update G.
+        frame_flags = VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST |
+                      VP8_EFLAG_NO_UPD_ENTROPY;
+      }
+    } else if (num_temp_layers == 3) {
+      if (frame_num % 4 == 0) {
+        // Layer 0: predict from L, update L.
+        frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_ARF |
+                      VP8_EFLAG_NO_REF_GF | VP8_EFLAG_NO_REF_ARF;
+      } else if ((frame_num - 2) % 4 == 0) {
+        // Layer 1: predict from L and G, update G.
+        frame_flags =
+            VP8_EFLAG_NO_UPD_ARF | VP8_EFLAG_NO_UPD_LAST | VP8_EFLAG_NO_REF_ARF;
+      } else if ((frame_num - 1) % 2 == 0) {
+        // Layer 2: predict from L, G, ARF; update ARF.
+        frame_flags = VP8_EFLAG_NO_UPD_GF | VP8_EFLAG_NO_UPD_LAST;
+      }
+    }
+    return frame_flags;
+  }
+
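+  // Maps a frame number to its temporal layer id: with 2 layers the
+  // per-frame sequence is 0, 1, 0, 1, ...; with 3 layers it is 0, 2, 1, 2
+  // over each 4-frame period.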
+  int SetLayerId(int frame_num, int num_temp_layers) {
+    int layer_id = 0;
+    if (num_temp_layers == 2) {
+      if (frame_num % 2 == 0) {
+        layer_id = 0;
+      } else {
+        layer_id = 1;
+      }
+    } else if (num_temp_layers == 3) {
+      if (frame_num % 4 == 0) {
+        layer_id = 0;
+      } else if ((frame_num - 2) % 4 == 0) {
+        layer_id = 1;
+      } else if ((frame_num - 1) % 2 == 0) {
+        layer_id = 2;
+      }
+    }
+    return layer_id;
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (rc_cfg_.ts_number_layers > 1) {
+      const int layer_id = SetLayerId(video->frame(), cfg_.ts_number_layers);
+      const int frame_flags =
+          SetFrameFlags(video->frame(), cfg_.ts_number_layers);
+      frame_params_.temporal_layer_id = layer_id;
+      if (video->frame() > 0) {
+        encoder->Control(VP8E_SET_TEMPORAL_LAYER_ID, layer_id);
+        encoder->Control(VP8E_SET_FRAME_FLAGS, frame_flags);
+      }
+    } else {
+      if (video->frame() == 0) {
+        encoder->Control(VP8E_SET_CPUUSED, -6);
+        encoder->Control(VP8E_SET_RTC_EXTERNAL_RATECTRL, 1);
+        encoder->Control(VP8E_SET_MAX_INTRA_BITRATE_PCT, 1000);
+      }
+      if (frame_params_.frame_type == INTER_FRAME) {
+        // Disable golden frame update.
+        frame_flags_ |= VP8_EFLAG_NO_UPD_GF;
+        frame_flags_ |= VP8_EFLAG_NO_UPD_ARF;
+      }
+    }
+    frame_params_.frame_type =
+        video->frame() % key_interval_ == 0 ? KEY_FRAME : INTER_FRAME;
+    encoder_exit_ = video->frame() == test_video_.frames;
+  }
+
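+  // The external rate controller runs in lockstep with the encoder:
+  // ComputeQP() derives a QP for the current frame, which must match the
+  // encoder's VP8E_GET_LAST_QUANTIZER, and FramePktHook() feeds the
+  // compressed frame size back through PostEncodeUpdate().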
+  virtual void PostEncodeFrameHook(::libvpx_test::Encoder *encoder) {
+    if (encoder_exit_) {
+      return;
+    }
+    int qp;
+    encoder->Control(VP8E_GET_LAST_QUANTIZER, &qp);
+    rc_api_->ComputeQP(frame_params_);
+    ASSERT_EQ(rc_api_->GetQP(), qp);
+  }
+
+  virtual void FramePktHook(const vpx_codec_cx_pkt_t *pkt) {
+    rc_api_->PostEncodeUpdate(pkt->data.frame.sz);
+  }
+
+  void RunOneLayer() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    if (test_video_.width == 1280 && target_bitrate_ == 200) return;
+    if (test_video_.width == 640 && target_bitrate_ == 1000) return;
+    SetConfig();
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    rc_api_->UpdateRateControl(rc_cfg_);
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunPeriodicKey() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    if (test_video_.width == 1280 && target_bitrate_ == 200) return;
+    if (test_video_.width == 640 && target_bitrate_ == 1000) return;
+    key_interval_ = 100;
+    SetConfig();
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    rc_api_->UpdateRateControl(rc_cfg_);
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunTemporalLayers2TL() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    if (test_video_.width == 1280 && target_bitrate_ == 200) return;
+    if (test_video_.width == 640 && target_bitrate_ == 1000) return;
+    SetConfigTemporalLayers(2);
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    rc_api_->UpdateRateControl(rc_cfg_);
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+  void RunTemporalLayers3TL() {
+    test_video_ = GET_PARAM(2);
+    target_bitrate_ = GET_PARAM(1);
+    if (test_video_.width == 1280 && target_bitrate_ == 200) return;
+    if (test_video_.width == 640 && target_bitrate_ == 1000) return;
+    SetConfigTemporalLayers(3);
+    rc_api_ = libvpx::VP8RateControlRTC::Create(rc_cfg_);
+    rc_api_->UpdateRateControl(rc_cfg_);
+
+    ::libvpx_test::I420VideoSource video(test_video_.name, test_video_.width,
+                                         test_video_.height, 30, 1, 0,
+                                         test_video_.frames);
+
+    ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+  }
+
+ private:
+  void SetConfig() {
+    rc_cfg_.width = test_video_.width;
+    rc_cfg_.height = test_video_.height;
+    rc_cfg_.max_quantizer = 60;
+    rc_cfg_.min_quantizer = 2;
+    rc_cfg_.target_bandwidth = target_bitrate_;
+    rc_cfg_.buf_initial_sz = 600;
+    rc_cfg_.buf_optimal_sz = 600;
+    rc_cfg_.buf_sz = target_bitrate_;
+    rc_cfg_.undershoot_pct = 50;
+    rc_cfg_.overshoot_pct = 50;
+    rc_cfg_.max_intra_bitrate_pct = 1000;
+    rc_cfg_.framerate = 30.0;
+    rc_cfg_.layer_target_bitrate[0] = target_bitrate_;
+
+    // Encoder settings for ground truth.
+    cfg_.g_w = test_video_.width;
+    cfg_.g_h = test_video_.height;
+    cfg_.rc_undershoot_pct = 50;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_buf_initial_sz = 600;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_buf_sz = target_bitrate_;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 60;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+    cfg_.rc_target_bitrate = target_bitrate_;
+    cfg_.kf_min_dist = key_interval_;
+    cfg_.kf_max_dist = key_interval_;
+  }
+
+  void SetConfigTemporalLayers(int temporal_layers) {
+    rc_cfg_.width = test_video_.width;
+    rc_cfg_.height = test_video_.height;
+    rc_cfg_.max_quantizer = 60;
+    rc_cfg_.min_quantizer = 2;
+    rc_cfg_.target_bandwidth = target_bitrate_;
+    rc_cfg_.buf_initial_sz = 600;
+    rc_cfg_.buf_optimal_sz = 600;
+    rc_cfg_.buf_sz = target_bitrate_;
+    rc_cfg_.undershoot_pct = 50;
+    rc_cfg_.overshoot_pct = 50;
+    rc_cfg_.max_intra_bitrate_pct = 1000;
+    rc_cfg_.framerate = 30.0;
+    if (temporal_layers == 2) {
+      rc_cfg_.layer_target_bitrate[0] = 60 * target_bitrate_ / 100;
+      rc_cfg_.layer_target_bitrate[1] = target_bitrate_;
+      rc_cfg_.ts_rate_decimator[0] = 2;
+      rc_cfg_.ts_rate_decimator[1] = 1;
+    } else if (temporal_layers == 3) {
+      rc_cfg_.layer_target_bitrate[0] = 40 * target_bitrate_ / 100;
+      rc_cfg_.layer_target_bitrate[1] = 60 * target_bitrate_ / 100;
+      rc_cfg_.layer_target_bitrate[2] = target_bitrate_;
+      rc_cfg_.ts_rate_decimator[0] = 4;
+      rc_cfg_.ts_rate_decimator[1] = 2;
+      rc_cfg_.ts_rate_decimator[2] = 1;
+    }
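+    // layer_target_bitrate[] is cumulative: each layer's target includes all
+    // lower temporal layers, so the top layer carries the full
+    // target_bitrate_.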
+
+    rc_cfg_.ts_number_layers = temporal_layers;
+
+    // Encoder settings for ground truth.
+    cfg_.g_w = test_video_.width;
+    cfg_.g_h = test_video_.height;
+    cfg_.rc_undershoot_pct = 50;
+    cfg_.rc_overshoot_pct = 50;
+    cfg_.rc_buf_initial_sz = 600;
+    cfg_.rc_buf_optimal_sz = 600;
+    cfg_.rc_buf_sz = target_bitrate_;
+    cfg_.rc_dropframe_thresh = 0;
+    cfg_.rc_min_quantizer = 2;
+    cfg_.rc_max_quantizer = 60;
+    cfg_.rc_end_usage = VPX_CBR;
+    cfg_.g_lag_in_frames = 0;
+    cfg_.g_error_resilient = 1;
+    cfg_.rc_target_bitrate = target_bitrate_;
+    cfg_.kf_min_dist = key_interval_;
+    cfg_.kf_max_dist = key_interval_;
+    // Temporal layers (2 or 3), no spatial layers, CBR mode.
+    cfg_.ss_number_layers = 1;
+    cfg_.ts_number_layers = temporal_layers;
+    if (temporal_layers == 2) {
+      cfg_.ts_rate_decimator[0] = 2;
+      cfg_.ts_rate_decimator[1] = 1;
+      cfg_.ts_periodicity = 2;
+      cfg_.ts_target_bitrate[0] = 60 * cfg_.rc_target_bitrate / 100;
+      cfg_.ts_target_bitrate[1] = cfg_.rc_target_bitrate;
+    } else if (temporal_layers == 3) {
+      cfg_.ts_rate_decimator[0] = 4;
+      cfg_.ts_rate_decimator[1] = 2;
+      cfg_.ts_rate_decimator[2] = 1;
+      cfg_.ts_periodicity = 4;
+      cfg_.ts_target_bitrate[0] = 40 * cfg_.rc_target_bitrate / 100;
+      cfg_.ts_target_bitrate[1] = 60 * cfg_.rc_target_bitrate / 100;
+      cfg_.ts_target_bitrate[2] = cfg_.rc_target_bitrate;
+    }
+  }
+
+  std::unique_ptr<libvpx::VP8RateControlRTC> rc_api_;
+  libvpx::VP8RateControlRtcConfig rc_cfg_;
+  int key_interval_;
+  int target_bitrate_;
+  Vp8RCTestVideo test_video_;
+  libvpx::VP8FrameParamsQpRTC frame_params_;
+  bool encoder_exit_;
+};
+
+TEST_P(Vp8RcInterfaceTest, OneLayer) { RunOneLayer(); }
+
+TEST_P(Vp8RcInterfaceTest, OneLayerPeriodicKey) { RunPeriodicKey(); }
+
+TEST_P(Vp8RcInterfaceTest, TemporalLayers2TL) { RunTemporalLayers2TL(); }
+
+TEST_P(Vp8RcInterfaceTest, TemporalLayers3TL) { RunTemporalLayers3TL(); }
+
+VP8_INSTANTIATE_TEST_SUITE(Vp8RcInterfaceTest,
+                           ::testing::Values(200, 400, 1000),
+                           ::testing::ValuesIn(kVp8RCTestVectors));
+
+}  // namespace
diff --git a/libvpx/test/vp9_ethread_test.cc b/libvpx/test/vp9_ethread_test.cc
index 21caf79..238366c 100644
--- a/libvpx/test/vp9_ethread_test.cc
+++ b/libvpx/test/vp9_ethread_test.cc
@@ -98,6 +98,7 @@
 
     firstpass_stats_.buf =
         realloc(firstpass_stats_.buf, firstpass_stats_.sz + pkt_size);
+    ASSERT_NE(firstpass_stats_.buf, nullptr);
     memcpy((uint8_t *)firstpass_stats_.buf + firstpass_stats_.sz, pkt_buf,
            pkt_size);
     firstpass_stats_.sz += pkt_size;
diff --git a/libvpx/test/vp9_quantize_test.cc b/libvpx/test/vp9_quantize_test.cc
index cb4481b..ca1062a 100644
--- a/libvpx/test/vp9_quantize_test.cc
+++ b/libvpx/test/vp9_quantize_test.cc
@@ -38,26 +38,24 @@
 const int number_of_iterations = 100;
 
 typedef void (*QuantizeFunc)(const tran_low_t *coeff, intptr_t count,
-                             int skip_block, const int16_t *zbin,
-                             const int16_t *round, const int16_t *quant,
-                             const int16_t *quant_shift, tran_low_t *qcoeff,
-                             tran_low_t *dqcoeff, const int16_t *dequant,
-                             uint16_t *eob, const int16_t *scan,
-                             const int16_t *iscan);
+                             const int16_t *zbin, const int16_t *round,
+                             const int16_t *quant, const int16_t *quant_shift,
+                             tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                             const int16_t *dequant, uint16_t *eob,
+                             const int16_t *scan, const int16_t *iscan);
 typedef std::tuple<QuantizeFunc, QuantizeFunc, vpx_bit_depth_t,
                    int /*max_size*/, bool /*is_fp*/>
     QuantizeParam;
 
 // Wrapper for FP version which does not use zbin or quant_shift.
 typedef void (*QuantizeFPFunc)(const tran_low_t *coeff, intptr_t count,
-                               int skip_block, const int16_t *round,
-                               const int16_t *quant, tran_low_t *qcoeff,
-                               tran_low_t *dqcoeff, const int16_t *dequant,
-                               uint16_t *eob, const int16_t *scan,
-                               const int16_t *iscan);
+                               const int16_t *round, const int16_t *quant,
+                               tran_low_t *qcoeff, tran_low_t *dqcoeff,
+                               const int16_t *dequant, uint16_t *eob,
+                               const int16_t *scan, const int16_t *iscan);
 
 template <QuantizeFPFunc fn>
-void QuantFPWrapper(const tran_low_t *coeff, intptr_t count, int skip_block,
+void QuantFPWrapper(const tran_low_t *coeff, intptr_t count,
                     const int16_t *zbin, const int16_t *round,
                     const int16_t *quant, const int16_t *quant_shift,
                     tran_low_t *qcoeff, tran_low_t *dqcoeff,
@@ -66,8 +64,7 @@
   (void)zbin;
   (void)quant_shift;
 
-  fn(coeff, count, skip_block, round, quant, qcoeff, dqcoeff, dequant, eob,
-     scan, iscan);
+  fn(coeff, count, round, quant, qcoeff, dqcoeff, dequant, eob, scan, iscan);
 }
 
 class VP9QuantizeBase : public AbstractBench {
@@ -138,7 +135,6 @@
   int16_t *r_ptr_;
   int16_t *q_ptr_;
   int count_;
-  int skip_block_;
   const scan_order *scan_;
   uint16_t eob_;
 };
@@ -157,8 +153,8 @@
 };
 
 void VP9QuantizeTest::Run() {
-  quantize_op_(coeff_.TopLeftPixel(), count_, skip_block_, zbin_ptr_, r_ptr_,
-               q_ptr_, quant_shift_ptr_, qcoeff_.TopLeftPixel(),
+  quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
+               quant_shift_ptr_, qcoeff_.TopLeftPixel(),
                dqcoeff_.TopLeftPixel(), dequant_ptr_, &eob_, scan_->scan,
                scan_->iscan);
 }
@@ -167,16 +163,14 @@
 // determine if further multiplication operations are needed.
 // Based on vp9_quantize_fp_sse2().
 inline void quant_fp_nz(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                        int skip_block, const int16_t *round_ptr,
-                        const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-                        tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                        uint16_t *eob_ptr, const int16_t *scan,
-                        const int16_t *iscan, int is_32x32) {
+                        const int16_t *round_ptr, const int16_t *quant_ptr,
+                        tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                        const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                        const int16_t *scan, const int16_t *iscan,
+                        int is_32x32) {
   int i, eob = -1;
   const int thr = dequant_ptr[1] >> (1 + is_32x32);
   (void)iscan;
-  (void)skip_block;
-  assert(!skip_block);
 
   // Quantization pass: All coefficients with index >= zero_flag are
   // skippable. Note: zero_flag can be zero.
@@ -243,22 +237,20 @@
 }
 
 void quantize_fp_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                      int skip_block, const int16_t *round_ptr,
-                      const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-                      tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                      uint16_t *eob_ptr, const int16_t *scan,
-                      const int16_t *iscan) {
-  quant_fp_nz(coeff_ptr, n_coeffs, skip_block, round_ptr, quant_ptr, qcoeff_ptr,
+                      const int16_t *round_ptr, const int16_t *quant_ptr,
+                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                      const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                      const int16_t *scan, const int16_t *iscan) {
+  quant_fp_nz(coeff_ptr, n_coeffs, round_ptr, quant_ptr, qcoeff_ptr,
               dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 0);
 }
 
 void quantize_fp_32x32_nz_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                            int skip_block, const int16_t *round_ptr,
-                            const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                            uint16_t *eob_ptr, const int16_t *scan,
-                            const int16_t *iscan) {
-  quant_fp_nz(coeff_ptr, n_coeffs, skip_block, round_ptr, quant_ptr, qcoeff_ptr,
+                            const int16_t *round_ptr, const int16_t *quant_ptr,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                            const int16_t *scan, const int16_t *iscan) {
+  quant_fp_nz(coeff_ptr, n_coeffs, round_ptr, quant_ptr, qcoeff_ptr,
               dqcoeff_ptr, dequant_ptr, eob_ptr, scan, iscan, 1);
 }
 
@@ -316,9 +308,6 @@
   eob_ = 0;
 
   for (int i = 0; i < number_of_iterations; ++i) {
-    // Test skip block for the first three iterations to catch all the different
-    // sizes.
-    const int skip_block = 0;
     TX_SIZE sz;
     if (max_size_ == 16) {
       sz = static_cast<TX_SIZE>(i % 3);  // TX_4X4, TX_8X8 TX_16X16
@@ -332,13 +321,13 @@
     GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
                          quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
                          quant_fp_ptr_);
-    ref_quantize_op_(coeff_.TopLeftPixel(), count_, skip_block, zbin_ptr_,
-                     r_ptr_, q_ptr_, quant_shift_ptr_,
-                     ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(),
-                     dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan);
+    ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
+                     quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
+                     ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
+                     scan_->scan, scan_->iscan);
 
     ASM_REGISTER_STATE_CHECK(quantize_op_(
-        coeff_.TopLeftPixel(), count_, skip_block, zbin_ptr_, r_ptr_, q_ptr_,
+        coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
         quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(),
         dequant_ptr_, &eob_, scan_->scan, scan_->iscan));
 
@@ -372,7 +361,6 @@
   const uint32_t max_index = max_size_ * max_size_ - 1;
 
   for (int i = 0; i < number_of_iterations; ++i) {
-    skip_block_ = 0;
     TX_SIZE sz;
     if (max_size_ == 16) {
       sz = static_cast<TX_SIZE>(i % 3);  // TX_4X4, TX_8X8 TX_16X16
@@ -391,13 +379,13 @@
     GenerateHelperArrays(&rnd, zbin_ptr_, round_ptr_, quant_ptr_,
                          quant_shift_ptr_, dequant_ptr_, round_fp_ptr_,
                          quant_fp_ptr_);
-    ref_quantize_op_(coeff_.TopLeftPixel(), count_, skip_block_, zbin_ptr_,
-                     r_ptr_, q_ptr_, quant_shift_ptr_,
-                     ref_qcoeff.TopLeftPixel(), ref_dqcoeff.TopLeftPixel(),
-                     dequant_ptr_, &ref_eob, scan_->scan, scan_->iscan);
+    ref_quantize_op_(coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
+                     quant_shift_ptr_, ref_qcoeff.TopLeftPixel(),
+                     ref_dqcoeff.TopLeftPixel(), dequant_ptr_, &ref_eob,
+                     scan_->scan, scan_->iscan);
 
     ASM_REGISTER_STATE_CHECK(quantize_op_(
-        coeff_.TopLeftPixel(), count_, skip_block_, zbin_ptr_, r_ptr_, q_ptr_,
+        coeff_.TopLeftPixel(), count_, zbin_ptr_, r_ptr_, q_ptr_,
         quant_shift_ptr_, qcoeff_.TopLeftPixel(), dqcoeff_.TopLeftPixel(),
         dequant_ptr_, &eob_, scan_->scan, scan_->iscan));
 
@@ -433,7 +421,6 @@
   for (TX_SIZE sz = starting_sz; sz <= ending_sz; ++sz) {
     // zbin > coeff, zbin < coeff.
     for (int i = 0; i < 2; ++i) {
-      skip_block_ = 0;
       // TX_TYPE defines the scan order. That is not relevant to the speed test.
       // Pick the first one.
       const TX_TYPE tx_type = DCT_DCT;
@@ -581,6 +568,16 @@
                                  VPX_BITS_8, 32, true)));
 #endif  // HAVE_VSX && !CONFIG_VP9_HIGHBITDEPTH
 
+#if HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
+INSTANTIATE_TEST_SUITE_P(LSX, VP9QuantizeTest,
+                         ::testing::Values(make_tuple(&vpx_quantize_b_lsx,
+                                                      &vpx_quantize_b_c,
+                                                      VPX_BITS_8, 16, false),
+                                           make_tuple(&vpx_quantize_b_32x32_lsx,
+                                                      &vpx_quantize_b_32x32_c,
+                                                      VPX_BITS_8, 32, false)));
+#endif  // HAVE_LSX && !CONFIG_VP9_HIGHBITDEPTH
+
 // Only useful to compare "Speed" test results.
 INSTANTIATE_TEST_SUITE_P(
     DISABLED_C, VP9QuantizeTest,
diff --git a/libvpx/test/ratectrl_rtc_test.cc b/libvpx/test/vp9_ratectrl_rtc_test.cc
similarity index 99%
rename from libvpx/test/ratectrl_rtc_test.cc
rename to libvpx/test/vp9_ratectrl_rtc_test.cc
index 8136bd8..b09a45b 100644
--- a/libvpx/test/ratectrl_rtc_test.cc
+++ b/libvpx/test/vp9_ratectrl_rtc_test.cc
@@ -270,6 +270,7 @@
     for (int i = 0; i < VPX_MAX_LAYERS; ++i) {
       svc_params_.max_quantizers[i] = 56;
       svc_params_.min_quantizers[i] = 2;
+      svc_params_.speed_per_layer[i] = 7;
     }
     cfg_.rc_end_usage = VPX_CBR;
     cfg_.g_lag_in_frames = 0;
@@ -318,6 +319,7 @@
     rc_cfg_.ss_number_layers = 3;
     rc_cfg_.ts_number_layers = 3;
     rc_cfg_.rc_mode = VPX_CBR;
+    rc_cfg_.aq_mode = aq_mode_;
 
     rc_cfg_.scaling_factor_num[0] = 1;
     rc_cfg_.scaling_factor_den[0] = 4;
@@ -367,10 +369,5 @@
 
 VP9_INSTANTIATE_TEST_SUITE(RcInterfaceTest, ::testing::Values(0, 3),
                            ::testing::Values(VPX_CBR, VPX_VBR));
-VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0));
+VP9_INSTANTIATE_TEST_SUITE(RcInterfaceSvcTest, ::testing::Values(0, 3));
 }  // namespace
-
-int main(int argc, char **argv) {
-  ::testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/libvpx/test/vp9_roi_test.cc b/libvpx/test/vp9_roi_test.cc
new file mode 100644
index 0000000..e8373c4
--- /dev/null
+++ b/libvpx/test/vp9_roi_test.cc
@@ -0,0 +1,148 @@
+/*
+ *  Copyright (c) 2014 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+
+#include "third_party/googletest/src/include/gtest/gtest.h"
+
+#include "test/codec_factory.h"
+#include "test/encode_test_driver.h"
+#include "test/i420_video_source.h"
+#include "test/util.h"
+#include "test/video_source.h"
+#include "test/y4m_video_source.h"
+#include "test/yuv_video_source.h"
+#include "vpx/vp8cx.h"
+#include "vpx/vpx_encoder.h"
+
+#define MASK_WIDTH 40
+#define MASK_HEIGHT 30
+#define MASK_SIZE (MASK_WIDTH * MASK_HEIGHT)
+
+namespace {
+
+const int mask[MASK_SIZE] = {
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
+  1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
+  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
+  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0,
+  0, 0, 0, 0
+};
+
+class RoiMaskBackgroundSkip : public ::libvpx_test::EncoderTest,
+                              public ::testing::Test {
+ protected:
+  RoiMaskBackgroundSkip() : EncoderTest(&::libvpx_test::kVP9) {}
+  virtual ~RoiMaskBackgroundSkip() { free(roi_.roi_map); }
+
+  virtual void SetUp() {
+    InitializeConfig();
+    SetMode(::libvpx_test::kRealTime);
+    SetRoi();
+  }
+
+  void SetRoi() {
+    const int block_size = 8;
+    unsigned int i, j;
+    roi_.rows = (cfg_.g_h + block_size - 1) / block_size;
+    roi_.cols = (cfg_.g_w + block_size - 1) / block_size;
+    memset(&roi_.skip, 0, sizeof(roi_.skip));
+    memset(&roi_.delta_q, 0, sizeof(roi_.delta_q));
+    memset(&roi_.delta_lf, 0, sizeof(roi_.delta_lf));
+    memset(roi_.ref_frame, -1, sizeof(roi_.ref_frame));
+    roi_.ref_frame[1] = 1;
+    // Use segment 3 for skip.
+    roi_.skip[3] = 1;
+    roi_.roi_map =
+        (uint8_t *)calloc(roi_.rows * roi_.cols, sizeof(*roi_.roi_map));
+    for (i = 0; i < roi_.rows; ++i) {
+      for (j = 0; j < roi_.cols; ++j) {
+        const int idx = i * roi_.cols + j;
+        if (mask[idx] == 1) roi_.roi_map[idx] = 3;
+      }
+    }
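+    // Mask entries equal to 1 map their 8x8 block to segment 3, which has
+    // skip enabled above; all other blocks stay in segment 0.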
+  }
+
+  virtual void PreEncodeFrameHook(::libvpx_test::VideoSource *video,
+                                  ::libvpx_test::Encoder *encoder) {
+    if (video->frame() == 0) {
+      encoder->Control(VP8E_SET_CPUUSED, 7);
+      encoder->Control(VP9E_SET_AQ_MODE, 3);
+    }
+    encoder->Control(VP9E_SET_ROI_MAP, &roi_);
+  }
+
+ private:
+  vpx_roi_map_t roi_;
+};
+
+TEST_F(RoiMaskBackgroundSkip, RoiMaskNoMismatch) {
+  cfg_.rc_buf_initial_sz = 500;
+  cfg_.rc_buf_optimal_sz = 500;
+  cfg_.rc_buf_sz = 1000;
+  cfg_.rc_undershoot_pct = 20;
+  cfg_.rc_overshoot_pct = 20;
+  cfg_.rc_dropframe_thresh = 10;
+  cfg_.rc_min_quantizer = 0;
+  cfg_.rc_max_quantizer = 50;
+  cfg_.rc_end_usage = VPX_CBR;
+  cfg_.rc_target_bitrate = 200;
+  cfg_.g_lag_in_frames = 0;
+  cfg_.kf_max_dist = 9999;
+
+  ::libvpx_test::I420VideoSource video("desktopqvga.320_240.yuv", 320, 240, 30,
+                                       1, 0, 150);
+  ASSERT_NO_FATAL_FAILURE(RunLoop(&video));
+}
+}  // namespace
diff --git a/libvpx/test/vp9_subtract_test.cc b/libvpx/test/vp9_subtract_test.cc
index ef8cc20..211cc6c 100644
--- a/libvpx/test/vp9_subtract_test.cc
+++ b/libvpx/test/vp9_subtract_test.cc
@@ -152,4 +152,9 @@
                          ::testing::Values(vpx_subtract_block_vsx));
 #endif
 
+#if HAVE_LSX
+INSTANTIATE_TEST_SUITE_P(LSX, VP9SubtractBlockTest,
+                         ::testing::Values(vpx_subtract_block_lsx));
+#endif
+
 }  // namespace vp9
diff --git a/libvpx/test/vp9_thread_test.cc b/libvpx/test/vp9_thread_test.cc
index 352ad71..1ceef81 100644
--- a/libvpx/test/vp9_thread_test.cc
+++ b/libvpx/test/vp9_thread_test.cc
@@ -148,11 +148,6 @@
 // -----------------------------------------------------------------------------
 // Multi-threaded decode tests
 #if CONFIG_WEBM_IO
-struct FileList {
-  const char *name;
-  const char *expected_md5;
-};
-
 // Decodes |filename| with |num_threads|. Returns the md5 of the decoded frames.
 string DecodeFile(const string &filename, int num_threads) {
   libvpx_test::WebMVideoSource video(filename);
@@ -182,16 +177,6 @@
   return string(md5.Get());
 }
 
-void DecodeFiles(const FileList files[]) {
-  for (const FileList *iter = files; iter->name != nullptr; ++iter) {
-    SCOPED_TRACE(iter->name);
-    for (int t = 1; t <= 8; ++t) {
-      EXPECT_EQ(iter->expected_md5, DecodeFile(iter->name, t))
-          << "threads = " << t;
-    }
-  }
-}
-
 // Trivial serialized thread worker interface implementation.
 // Note any worker that requires synchronization between other workers will
 // hang.
@@ -216,10 +201,6 @@
   static const VPxWorkerInterface serial_interface = {
     impl::Init, impl::Reset, impl::Sync, impl::Launch, impl::Execute, impl::End
   };
-  // TODO(jzern): Avoid using a file that will use the row-based thread
-  // loopfilter, with the simple serialized implementation it will hang. This is
-  // due to its expectation that rows will be run in parallel as they wait on
-  // progress in the row above before proceeding.
   static const char expected_md5[] = "b35a1b707b28e82be025d960aba039bc";
   static const char filename[] = "vp90-2-03-size-226x226.webm";
   VPxWorkerInterface default_interface = *vpx_get_worker_interface();
@@ -232,88 +213,81 @@
   EXPECT_EQ(expected_md5, DecodeFile(filename, 2));
 }
 
-TEST(VP9DecodeMultiThreadedTest, NoTilesNonFrameParallel) {
-  // no tiles or frame parallel; this exercises loop filter threading.
-  EXPECT_EQ("b35a1b707b28e82be025d960aba039bc",
-            DecodeFile("vp90-2-03-size-226x226.webm", 2));
+struct FileParam {
+  const char *name;
+  const char *expected_md5;
+  friend std::ostream &operator<<(std::ostream &os, const FileParam &param) {
+    return os << "file name: " << param.name
+              << " digest: " << param.expected_md5;
+  }
+};
+
+class VP9DecodeMultiThreadedTest : public ::testing::TestWithParam<FileParam> {
+};
+
+TEST_P(VP9DecodeMultiThreadedTest, Decode) {
+  for (int t = 1; t <= 8; ++t) {
+    EXPECT_EQ(GetParam().expected_md5, DecodeFile(GetParam().name, t))
+        << "threads = " << t;
+  }
 }
 
-TEST(VP9DecodeMultiThreadedTest, FrameParallel) {
-  static const FileList files[] = { { "vp90-2-08-tile_1x2_frame_parallel.webm",
-                                      "68ede6abd66bae0a2edf2eb9232241b6" },
-                                    { "vp90-2-08-tile_1x4_frame_parallel.webm",
-                                      "368ebc6ebf3a5e478d85b2c3149b2848" },
-                                    { "vp90-2-08-tile_1x8_frame_parallel.webm",
-                                      "17e439da2388aff3a0f69cb22579c6c1" },
-                                    { nullptr, nullptr } };
+const FileParam kNoTilesNonFrameParallelFiles[] = {
+  { "vp90-2-03-size-226x226.webm", "b35a1b707b28e82be025d960aba039bc" }
+};
 
-  DecodeFiles(files);
-}
+const FileParam kFrameParallelFiles[] = {
+  { "vp90-2-08-tile_1x2_frame_parallel.webm",
+    "68ede6abd66bae0a2edf2eb9232241b6" },
+  { "vp90-2-08-tile_1x4_frame_parallel.webm",
+    "368ebc6ebf3a5e478d85b2c3149b2848" },
+  { "vp90-2-08-tile_1x8_frame_parallel.webm",
+    "17e439da2388aff3a0f69cb22579c6c1" },
+};
 
-TEST(VP9DecodeMultiThreadedTest, FrameParallelResize) {
-  static const FileList files[] = {
-    { "vp90-2-14-resize-fp-tiles-1-16.webm",
-      "0cd5e632c326297e975f38949c31ea94" },
-    { "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm",
-      "5c78a96a42e7f4a4f6b2edcdb791e44c" },
-    { "vp90-2-14-resize-fp-tiles-1-2.webm",
-      "e030450ae85c3277be2a418769df98e2" },
-    { "vp90-2-14-resize-fp-tiles-1-4.webm",
-      "312eed4e2b64eb7a4e7f18916606a430" },
-    { "vp90-2-14-resize-fp-tiles-16-1.webm",
-      "1755c16d8af16a9cb3fe7338d90abe52" },
-    { "vp90-2-14-resize-fp-tiles-16-2.webm",
-      "500300592d3fcb6f12fab25e48aaf4df" },
-    { "vp90-2-14-resize-fp-tiles-16-4.webm",
-      "47c48379fa6331215d91c67648e1af6e" },
-    { "vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm",
-      "eecf17290739bc708506fa4827665989" },
-    { "vp90-2-14-resize-fp-tiles-16-8.webm",
-      "29b6bb54e4c26b5ca85d5de5fed94e76" },
-    { "vp90-2-14-resize-fp-tiles-1-8.webm",
-      "1b6f175e08cd82cf84bb800ac6d1caa3" },
-    { "vp90-2-14-resize-fp-tiles-2-16.webm",
-      "ca3b03e4197995d8d5444ede7a6c0804" },
-    { "vp90-2-14-resize-fp-tiles-2-1.webm",
-      "99aec065369d70bbb78ccdff65afed3f" },
-    { "vp90-2-14-resize-fp-tiles-2-4.webm",
-      "22d0ebdb49b87d2920a85aea32e1afd5" },
-    { "vp90-2-14-resize-fp-tiles-2-8.webm",
-      "c2115cf051c62e0f7db1d4a783831541" },
-    { "vp90-2-14-resize-fp-tiles-4-16.webm",
-      "c690d7e1719b31367564cac0af0939cb" },
-    { "vp90-2-14-resize-fp-tiles-4-1.webm",
-      "a926020b2cc3e15ad4cc271853a0ff26" },
-    { "vp90-2-14-resize-fp-tiles-4-2.webm",
-      "42699063d9e581f1993d0cf890c2be78" },
-    { "vp90-2-14-resize-fp-tiles-4-8.webm",
-      "7f76d96036382f45121e3d5aa6f8ec52" },
-    { "vp90-2-14-resize-fp-tiles-8-16.webm",
-      "76a43fcdd7e658542913ea43216ec55d" },
-    { "vp90-2-14-resize-fp-tiles-8-1.webm",
-      "8e3fbe89486ca60a59299dea9da91378" },
-    { "vp90-2-14-resize-fp-tiles-8-2.webm",
-      "ae96f21f21b6370cc0125621b441fc52" },
-    { "vp90-2-14-resize-fp-tiles-8-4.webm",
-      "3eb4f24f10640d42218f7fd7b9fd30d4" },
-    { nullptr, nullptr }
-  };
+const FileParam kFrameParallelResizeFiles[] = {
+  { "vp90-2-14-resize-fp-tiles-1-16.webm", "0cd5e632c326297e975f38949c31ea94" },
+  { "vp90-2-14-resize-fp-tiles-1-2-4-8-16.webm",
+    "5c78a96a42e7f4a4f6b2edcdb791e44c" },
+  { "vp90-2-14-resize-fp-tiles-1-2.webm", "e030450ae85c3277be2a418769df98e2" },
+  { "vp90-2-14-resize-fp-tiles-1-4.webm", "312eed4e2b64eb7a4e7f18916606a430" },
+  { "vp90-2-14-resize-fp-tiles-16-1.webm", "1755c16d8af16a9cb3fe7338d90abe52" },
+  { "vp90-2-14-resize-fp-tiles-16-2.webm", "500300592d3fcb6f12fab25e48aaf4df" },
+  { "vp90-2-14-resize-fp-tiles-16-4.webm", "47c48379fa6331215d91c67648e1af6e" },
+  { "vp90-2-14-resize-fp-tiles-16-8-4-2-1.webm",
+    "eecf17290739bc708506fa4827665989" },
+  { "vp90-2-14-resize-fp-tiles-16-8.webm", "29b6bb54e4c26b5ca85d5de5fed94e76" },
+  { "vp90-2-14-resize-fp-tiles-1-8.webm", "1b6f175e08cd82cf84bb800ac6d1caa3" },
+  { "vp90-2-14-resize-fp-tiles-2-16.webm", "ca3b03e4197995d8d5444ede7a6c0804" },
+  { "vp90-2-14-resize-fp-tiles-2-1.webm", "99aec065369d70bbb78ccdff65afed3f" },
+  { "vp90-2-14-resize-fp-tiles-2-4.webm", "22d0ebdb49b87d2920a85aea32e1afd5" },
+  { "vp90-2-14-resize-fp-tiles-2-8.webm", "c2115cf051c62e0f7db1d4a783831541" },
+  { "vp90-2-14-resize-fp-tiles-4-16.webm", "c690d7e1719b31367564cac0af0939cb" },
+  { "vp90-2-14-resize-fp-tiles-4-1.webm", "a926020b2cc3e15ad4cc271853a0ff26" },
+  { "vp90-2-14-resize-fp-tiles-4-2.webm", "42699063d9e581f1993d0cf890c2be78" },
+  { "vp90-2-14-resize-fp-tiles-4-8.webm", "7f76d96036382f45121e3d5aa6f8ec52" },
+  { "vp90-2-14-resize-fp-tiles-8-16.webm", "76a43fcdd7e658542913ea43216ec55d" },
+  { "vp90-2-14-resize-fp-tiles-8-1.webm", "8e3fbe89486ca60a59299dea9da91378" },
+  { "vp90-2-14-resize-fp-tiles-8-2.webm", "ae96f21f21b6370cc0125621b441fc52" },
+  { "vp90-2-14-resize-fp-tiles-8-4.webm", "3eb4f24f10640d42218f7fd7b9fd30d4" },
+};
 
-  DecodeFiles(files);
-}
+const FileParam kNonFrameParallelFiles[] = {
+  { "vp90-2-08-tile_1x2.webm", "570b4a5d5a70d58b5359671668328a16" },
+  { "vp90-2-08-tile_1x4.webm", "988d86049e884c66909d2d163a09841a" },
+  { "vp90-2-08-tile_1x8.webm", "0941902a52e9092cb010905eab16364c" },
+  { "vp90-2-08-tile-4x1.webm", "06505aade6647c583c8e00a2f582266f" },
+  { "vp90-2-08-tile-4x4.webm", "85c2299892460d76e2c600502d52bfe2" },
+};
 
-TEST(VP9DecodeMultiThreadedTest, NonFrameParallel) {
-  static const FileList files[] = {
-    { "vp90-2-08-tile_1x2.webm", "570b4a5d5a70d58b5359671668328a16" },
-    { "vp90-2-08-tile_1x4.webm", "988d86049e884c66909d2d163a09841a" },
-    { "vp90-2-08-tile_1x8.webm", "0941902a52e9092cb010905eab16364c" },
-    { "vp90-2-08-tile-4x1.webm", "06505aade6647c583c8e00a2f582266f" },
-    { "vp90-2-08-tile-4x4.webm", "85c2299892460d76e2c600502d52bfe2" },
-    { nullptr, nullptr }
-  };
-
-  DecodeFiles(files);
-}
+INSTANTIATE_TEST_SUITE_P(NoTilesNonFrameParallel, VP9DecodeMultiThreadedTest,
+                         ::testing::ValuesIn(kNoTilesNonFrameParallelFiles));
+INSTANTIATE_TEST_SUITE_P(FrameParallel, VP9DecodeMultiThreadedTest,
+                         ::testing::ValuesIn(kFrameParallelFiles));
+INSTANTIATE_TEST_SUITE_P(FrameParallelResize, VP9DecodeMultiThreadedTest,
+                         ::testing::ValuesIn(kFrameParallelResizeFiles));
+INSTANTIATE_TEST_SUITE_P(NonFrameParallel, VP9DecodeMultiThreadedTest,
+                         ::testing::ValuesIn(kNonFrameParallelFiles));
 #endif  // CONFIG_WEBM_IO
 
 INSTANTIATE_TEST_SUITE_P(Synchronous, VPxWorkerThreadTest, ::testing::Bool());
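
The hunk above replaces the hand-rolled `FileList` array and `DecodeFiles()` loop with a value-parameterized suite, so each file/digest pair becomes its own printable test instance. A minimal sketch of the same pattern, with a stub `DecodeFile()` standing in for the test's real WebM decoding helper:

```cpp
#include <ostream>
#include <string>
#include "gtest/gtest.h"

struct FileParam {
  const char *name;
  const char *expected_md5;
  // Streaming the parameter gives readable per-instance failure output.
  friend std::ostream &operator<<(std::ostream &os, const FileParam &param) {
    return os << "file name: " << param.name
              << " digest: " << param.expected_md5;
  }
};

// Stub standing in for the real decoder helper; it returns a fixed digest
// so the sketch is self-contained.
std::string DecodeFile(const std::string & /*filename*/, int /*num_threads*/) {
  return "stub-digest";
}

class FileDecodeTest : public ::testing::TestWithParam<FileParam> {};

TEST_P(FileDecodeTest, Decode) {
  // Each parameter is exercised across several thread counts, exactly as
  // the rewritten VP9DecodeMultiThreadedTest does.
  for (int t = 1; t <= 8; ++t) {
    EXPECT_EQ(GetParam().expected_md5, DecodeFile(GetParam().name, t))
        << "threads = " << t;
  }
}

const FileParam kDemoFiles[] = {
  { "a.webm", "stub-digest" },
  { "b.webm", "stub-digest" },
};

INSTANTIATE_TEST_SUITE_P(Demo, FileDecodeTest,
                         ::testing::ValuesIn(kDemoFiles));
```

Each entry now appears as a separately named test in the runner's output, which is what the four `INSTANTIATE_TEST_SUITE_P` calls above achieve for the real file lists.
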
diff --git a/libvpx/test/webm_video_source.h b/libvpx/test/webm_video_source.h
index 6f55f7d..d245926 100644
--- a/libvpx/test/webm_video_source.h
+++ b/libvpx/test/webm_video_source.h
@@ -26,11 +26,11 @@
  public:
   explicit WebMVideoSource(const std::string &file_name)
       : file_name_(file_name), vpx_ctx_(new VpxInputContext()),
-        webm_ctx_(new WebmInputContext()), buf_(NULL), buf_sz_(0), frame_(0),
+        webm_ctx_(new WebmInputContext()), buf_(nullptr), buf_sz_(0), frame_(0),
         end_of_file_(false) {}
 
   virtual ~WebMVideoSource() {
-    if (vpx_ctx_->file != NULL) fclose(vpx_ctx_->file);
+    if (vpx_ctx_->file != nullptr) fclose(vpx_ctx_->file);
     webm_free(webm_ctx_);
     delete vpx_ctx_;
     delete webm_ctx_;
@@ -40,7 +40,7 @@
 
   virtual void Begin() {
     vpx_ctx_->file = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(vpx_ctx_->file != NULL)
+    ASSERT_NE(vpx_ctx_->file, nullptr)
         << "Input file open failed. Filename: " << file_name_;
 
     ASSERT_EQ(file_is_webm(webm_ctx_, vpx_ctx_), 1) << "file is not WebM";
@@ -54,7 +54,7 @@
   }
 
   void FillFrame() {
-    ASSERT_TRUE(vpx_ctx_->file != NULL);
+    ASSERT_NE(vpx_ctx_->file, nullptr);
     const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_);
     ASSERT_GE(status, 0) << "webm_read_frame failed";
     if (status == 1) {
@@ -63,7 +63,7 @@
   }
 
   void SeekToNextKeyFrame() {
-    ASSERT_TRUE(vpx_ctx_->file != NULL);
+    ASSERT_NE(vpx_ctx_->file, nullptr);
     do {
       const int status = webm_read_frame(webm_ctx_, &buf_, &buf_sz_);
       ASSERT_GE(status, 0) << "webm_read_frame failed";
@@ -74,7 +74,9 @@
     } while (!webm_ctx_->is_key_frame && !end_of_file_);
   }
 
-  virtual const uint8_t *cxdata() const { return end_of_file_ ? NULL : buf_; }
+  virtual const uint8_t *cxdata() const {
+    return end_of_file_ ? nullptr : buf_;
+  }
   virtual size_t frame_size() const { return buf_sz_; }
   virtual unsigned int frame_number() const { return frame_; }
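
The `ASSERT_NE(ptr, nullptr)` form adopted throughout these sources is more than a style change: the binary assertion prints both operand values when it fails, whereas `ASSERT_TRUE(ptr != NULL)` only reports that the whole expression was false. A small sketch, with a hypothetical `OpenTestDataFile()`:

```cpp
#include <cstdio>
#include "gtest/gtest.h"

// Hypothetical helper mirroring the test sources' file-opening pattern.
FILE *OpenTestDataFile(const char *name) { return std::fopen(name, "rb"); }

TEST(NullCheckStyle, BinaryAssertionReportsOperands) {
  FILE *file = OpenTestDataFile("vp90-2-03-size-226x226.webm");
  // On failure, ASSERT_NE reports both operands, roughly:
  //   Expected: (file) != (nullptr), actual: NULL vs (nullptr)
  ASSERT_NE(file, nullptr) << "Input file open failed.";
  std::fclose(file);
}
```
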
 
diff --git a/libvpx/test/y4m_test.cc b/libvpx/test/y4m_test.cc
index 8272263..32f2cd5 100644
--- a/libvpx/test/y4m_test.cc
+++ b/libvpx/test/y4m_test.cc
@@ -196,12 +196,13 @@
 
 TEST(Y4MHeaderTest, RegularHeader) {
   libvpx_test::TempOutFile f;
+  ASSERT_NE(f.file(), nullptr);
   fwrite(kY4MRegularHeader, 1, sizeof(kY4MRegularHeader), f.file());
   fflush(f.file());
   EXPECT_EQ(0, fseek(f.file(), 0, 0));
 
   y4m_input y4m;
-  EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/NULL,
+  EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/nullptr,
                            /*num_skip=*/0, /*only_420=*/0),
             0);
   EXPECT_EQ(y4m.pic_w, 4);
@@ -222,12 +223,13 @@
 
 TEST(Y4MHeaderTest, LongHeader) {
   libvpx_test::TempOutFile f;
+  ASSERT_NE(f.file(), nullptr);
   fwrite(kY4MLongHeader, 1, sizeof(kY4MLongHeader), f.file());
   fflush(f.file());
   EXPECT_EQ(fseek(f.file(), 0, 0), 0);
 
   y4m_input y4m;
-  EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/NULL,
+  EXPECT_EQ(y4m_input_open(&y4m, f.file(), /*skip_buffer=*/nullptr,
                            /*num_skip=*/0, /*only_420=*/0),
             0);
   EXPECT_EQ(y4m.pic_w, 4);
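
The `y4m_input_open()` call sites keep the `/*parameter=*/value` argument-comment convention while the literal changes from `NULL` to `nullptr`. The convention documents what each literal means at the call site, and tools such as clang-tidy's `bugprone-argument-comment` check can verify that the names match the declaration. A sketch with a hypothetical function:

```cpp
// Hypothetical declaration; the comments at the call site below mirror
// these parameter names.
int y4m_open_demo(void *skip_buffer, int num_skip, int only_420) {
  return (skip_buffer == nullptr && num_skip == 0 && only_420 == 0) ? 0 : 1;
}

int main() {
  // Each literal is annotated with its parameter name, as in the tests.
  return y4m_open_demo(/*skip_buffer=*/nullptr, /*num_skip=*/0,
                       /*only_420=*/0);
}
```
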
diff --git a/libvpx/test/y4m_video_source.h b/libvpx/test/y4m_video_source.h
index 89aa2a4..71fbf31 100644
--- a/libvpx/test/y4m_video_source.h
+++ b/libvpx/test/y4m_video_source.h
@@ -23,7 +23,7 @@
 class Y4mVideoSource : public VideoSource {
  public:
   Y4mVideoSource(const std::string &file_name, unsigned int start, int limit)
-      : file_name_(file_name), input_file_(NULL), img_(new vpx_image_t()),
+      : file_name_(file_name), input_file_(nullptr), img_(new vpx_image_t()),
         start_(start), limit_(limit), frame_(0), framerate_numerator_(0),
         framerate_denominator_(0), y4m_() {}
 
@@ -35,13 +35,13 @@
   virtual void OpenSource() {
     CloseSource();
     input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_ != NULL)
+    ASSERT_NE(input_file_, nullptr)
         << "Input file open failed. Filename: " << file_name_;
   }
 
   virtual void ReadSourceToStart() {
-    ASSERT_TRUE(input_file_ != NULL);
-    ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, NULL, 0, 0));
+    ASSERT_NE(input_file_, nullptr);
+    ASSERT_FALSE(y4m_input_open(&y4m_, input_file_, nullptr, 0, 0));
     framerate_numerator_ = y4m_.fps_n;
     framerate_denominator_ = y4m_.fps_d;
     frame_ = 0;
@@ -62,7 +62,7 @@
   }
 
   virtual vpx_image_t *img() const {
-    return (frame_ < limit_) ? img_.get() : NULL;
+    return (frame_ < limit_) ? img_.get() : nullptr;
   }
 
   // Models a stream where Timebase = 1/FPS, so pts == frame.
@@ -80,7 +80,7 @@
   virtual unsigned int limit() const { return limit_; }
 
   virtual void FillFrame() {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     // Read a frame from input_file.
     y4m_input_fetch_frame(&y4m_, input_file_, img_.get());
   }
@@ -101,9 +101,9 @@
   void CloseSource() {
     y4m_input_close(&y4m_);
     y4m_ = y4m_input();
-    if (input_file_ != NULL) {
+    if (input_file_ != nullptr) {
       fclose(input_file_);
-      input_file_ = NULL;
+      input_file_ = nullptr;
     }
   }
 
diff --git a/libvpx/test/yuv_temporal_filter_test.cc b/libvpx/test/yuv_temporal_filter_test.cc
index cfdc88d..2bdcf4d 100644
--- a/libvpx/test/yuv_temporal_filter_test.cc
+++ b/libvpx/test/yuv_temporal_filter_test.cc
@@ -674,8 +674,8 @@
          v_count);                                                            \
   }
 
-WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 10);
-WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 12);
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 10)
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_c, 12)
 
 INSTANTIATE_TEST_SUITE_P(
     C, YUVTemporalFilterTest,
@@ -683,8 +683,8 @@
         TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_10, 10),
         TemporalFilterWithBd(&wrap_vp9_highbd_apply_temporal_filter_c_12, 12)));
 #if HAVE_SSE4_1
-WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 10);
-WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 12);
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 10)
+WRAP_HIGHBD_FUNC(vp9_highbd_apply_temporal_filter_sse4_1, 12)
 
 INSTANTIATE_TEST_SUITE_P(
     SSE4_1, YUVTemporalFilterTest,
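
The semicolons dropped after `WRAP_HIGHBD_FUNC(...)` matter because the macro expands to a complete function definition (its expansion ends with the closing brace visible at the top of this hunk); writing `MACRO(...);` then leaves a stray empty declaration, which warnings such as `-Wextra-semi` flag. A sketch with a hypothetical wrapper macro:

```cpp
// The macro expands to a full function definition, so no trailing
// semicolon is needed (or wanted) at its point of use.
#define WRAP_BD_FUNC(func, bd) \
  void wrap_##func##_##bd(int *out) { func(out, bd); }

void set_bitdepth(int *out, int bd) { *out = bd; }

WRAP_BD_FUNC(set_bitdepth, 10)  // no ';' -- the expansion ends in '}'
WRAP_BD_FUNC(set_bitdepth, 12)  // a ';' here would be an empty declaration

int main() {
  int bd = 0;
  wrap_set_bitdepth_10(&bd);
  return bd == 10 ? 0 : 1;
}
```
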
diff --git a/libvpx/test/yuv_video_source.h b/libvpx/test/yuv_video_source.h
index 383ab8f..51948c0 100644
--- a/libvpx/test/yuv_video_source.h
+++ b/libvpx/test/yuv_video_source.h
@@ -27,8 +27,8 @@
   YUVVideoSource(const std::string &file_name, vpx_img_fmt format,
                  unsigned int width, unsigned int height, int rate_numerator,
                  int rate_denominator, unsigned int start, int limit)
-      : file_name_(file_name), input_file_(NULL), img_(NULL), start_(start),
-        limit_(limit), frame_(0), width_(0), height_(0),
+      : file_name_(file_name), input_file_(nullptr), img_(nullptr),
+        start_(start), limit_(limit), frame_(0), width_(0), height_(0),
         format_(VPX_IMG_FMT_NONE), framerate_numerator_(rate_numerator),
         framerate_denominator_(rate_denominator) {
     // This initializes format_, raw_size_, width_, height_ and allocates img.
@@ -43,7 +43,7 @@
   virtual void Begin() {
     if (input_file_) fclose(input_file_);
     input_file_ = OpenTestDataFile(file_name_);
-    ASSERT_TRUE(input_file_ != NULL)
+    ASSERT_NE(input_file_, nullptr)
         << "Input file open failed. Filename: " << file_name_;
     if (start_) {
       fseek(input_file_, static_cast<unsigned>(raw_size_) * start_, SEEK_SET);
@@ -58,7 +58,9 @@
     FillFrame();
   }
 
-  virtual vpx_image_t *img() const { return (frame_ < limit_) ? img_ : NULL; }
+  virtual vpx_image_t *img() const {
+    return (frame_ < limit_) ? img_ : nullptr;
+  }
 
   // Models a stream where Timebase = 1/FPS, so pts == frame.
   virtual vpx_codec_pts_t pts() const { return frame_; }
@@ -78,8 +80,8 @@
                        vpx_img_fmt format) {
     if (width != width_ || height != height_ || format != format_) {
       vpx_img_free(img_);
-      img_ = vpx_img_alloc(NULL, format, width, height, 1);
-      ASSERT_TRUE(img_ != NULL);
+      img_ = vpx_img_alloc(nullptr, format, width, height, 1);
+      ASSERT_NE(img_, nullptr);
       width_ = width;
       height_ = height;
       format_ = format;
@@ -99,7 +101,7 @@
   }
 
   virtual void FillFrame() {
-    ASSERT_TRUE(input_file_ != NULL);
+    ASSERT_NE(input_file_, nullptr);
     // Read a frame from input_file.
     if (fread(img_->img_data, raw_size_, 1, input_file_) == 0) {
       limit_ = frame_;
diff --git a/libvpx/third_party/googletest/README.libvpx b/libvpx/third_party/googletest/README.libvpx
index ed55fb0..b9a7492 100644
--- a/libvpx/third_party/googletest/README.libvpx
+++ b/libvpx/third_party/googletest/README.libvpx
@@ -1,5 +1,5 @@
 URL: https://github.com/google/googletest.git
-Version: release-1.10.0-224-g23b2a3b1
+Version: release-1.11.0
 License: BSD
 License File: LICENSE
 
@@ -13,11 +13,9 @@
 
 Local Modifications:
 - Remove everything but:
+  CONTRIBUTORS
   googletest/
-   CONTRIBUTORS
    include
-   LICENSE
    README.md
    src
-- Enable kErrorOnUninstantiatedParameterizedTest and
-  kErrorOnUninstantiatedTypeParameterizedTest in gtest.cc
+  LICENSE
diff --git a/libvpx/third_party/googletest/src/CONTRIBUTORS b/libvpx/third_party/googletest/src/CONTRIBUTORS
index 1e4afe2..76db0b4 100644
--- a/libvpx/third_party/googletest/src/CONTRIBUTORS
+++ b/libvpx/third_party/googletest/src/CONTRIBUTORS
@@ -5,34 +5,59 @@
 
 Ajay Joshi <jaj@google.com>
 Balázs Dán <balazs.dan@gmail.com>
+Benoit Sigoure <tsuna@google.com>
 Bharat Mediratta <bharat@menalto.com>
+Bogdan Piloca <boo@google.com>
 Chandler Carruth <chandlerc@google.com>
 Chris Prince <cprince@google.com>
 Chris Taylor <taylorc@google.com>
 Dan Egnor <egnor@google.com>
+Dave MacLachlan <dmaclach@gmail.com>
+David Anderson <danderson@google.com>
+Dean Sturtevant
 Eric Roman <eroman@chromium.org>
+Gene Volovich <gv@cite.com>
 Hady Zalek <hady.zalek@gmail.com>
+Hal Burch <gmock@hburch.com>
 Jeffrey Yasskin <jyasskin@google.com>
+Jim Keller <jimkeller@google.com>
+Joe Walnes <joe@truemesh.com>
+Jon Wray <jwray@google.com>
 Jói Sigurðsson <joi@google.com>
 Keir Mierle <mierle@gmail.com>
 Keith Ray <keith.ray@gmail.com>
 Kenton Varda <kenton@google.com>
+Kostya Serebryany <kcc@google.com>
 Krystian Kuzniarek <krystian.kuzniarek@gmail.com>
+Lev Makhlis
 Manuel Klimek <klimek@google.com>
+Mario Tanev <radix@google.com>
+Mark Paskin
 Markus Heule <markus.heule@gmail.com>
+Matthew Simmons <simmonmt@acm.org>
 Mika Raento <mikie@iki.fi>
+Mike Bland <mbland@google.com>
 Miklós Fazekas <mfazekas@szemafor.com>
+Neal Norwitz <nnorwitz@gmail.com>
+Nermin Ozkiranartli <nermin@google.com>
+Owen Carlsen <ocarlsen@google.com>
+Paneendra Ba <paneendra@google.com>
 Pasi Valminen <pasi.valminen@gmail.com>
 Patrick Hanna <phanna@google.com>
 Patrick Riley <pfr@google.com>
+Paul Menage <menage@google.com>
 Peter Kaminski <piotrk@google.com>
+Piotr Kaminski <piotrk@google.com>
 Preston Jackson <preston.a.jackson@gmail.com>
 Rainer Klaffenboeck <rainer.klaffenboeck@dynatrace.com>
 Russ Cox <rsc@google.com>
 Russ Rufer <russ@pentad.com>
 Sean Mcafee <eefacm@gmail.com>
 Sigurður Ásgeirsson <siggi@google.com>
+Sverre Sundsdal <sundsdal@gmail.com>
+Takeshi Yoshino <tyoshino@google.com>
 Tracy Bialik <tracy@pentad.com>
 Vadim Berman <vadimb@google.com>
 Vlad Losev <vladl@google.com>
+Wolfgang Klier <wklier@google.com>
 Zhanyong Wan <wan@google.com>
diff --git a/libvpx/third_party/googletest/src/README.md b/libvpx/third_party/googletest/src/README.md
index 904048f..1f8b349 100644
--- a/libvpx/third_party/googletest/src/README.md
+++ b/libvpx/third_party/googletest/src/README.md
@@ -2,39 +2,51 @@
 
 #### Setup
 
-To build Google Test and your tests that use it, you need to tell your build
+To build GoogleTest and your tests that use it, you need to tell your build
 system where to find its headers and source files. The exact way to do it
 depends on which build system you use, and is usually straightforward.
 
 ### Build with CMake
 
-Google Test comes with a CMake build script
+GoogleTest comes with a CMake build script
 ([CMakeLists.txt](https://github.com/google/googletest/blob/master/CMakeLists.txt))
 that can be used on a wide range of platforms ("C" stands for cross-platform.).
 If you don't have CMake installed already, you can download it for free from
 <http://www.cmake.org/>.
 
 CMake works by generating native makefiles or build projects that can be used in
-the compiler environment of your choice. You can either build Google Test as a
+the compiler environment of your choice. You can either build GoogleTest as a
 standalone project or it can be incorporated into an existing CMake build for
 another project.
 
 #### Standalone CMake Project
 
-When building Google Test as a standalone project, the typical workflow starts
-with:
-
-    mkdir mybuild       # Create a directory to hold the build output.
-    cd mybuild
-    cmake ${GTEST_DIR}  # Generate native build scripts.
-
-If you want to build Google Test's samples, you should replace the last command
+When building GoogleTest as a standalone project, the typical workflow starts
 with
 
-    cmake -Dgtest_build_samples=ON ${GTEST_DIR}
+```
+git clone https://github.com/google/googletest.git -b release-1.10.0
+cd googletest        # Main directory of the cloned repository.
+mkdir build          # Create a directory to hold the build output.
+cd build
+cmake ..             # Generate native build scripts for GoogleTest.
+```
+
+The above command also includes GoogleMock by default, so if you want to
+build only GoogleTest, replace the last command with
+
+```
+cmake .. -DBUILD_GMOCK=OFF
+```
 
 If you are on a \*nix system, you should now see a Makefile in the current
-directory. Just type 'make' to build gtest.
+directory. Just type `make` to build GoogleTest. You can then install
+GoogleTest system-wide if you have administrator privileges.
+
+```
+make
+sudo make install    # Install in /usr/local/ by default
+```
 
 If you use Windows and have Visual Studio installed, a `gtest.sln` file and
 several `.vcproj` files will be created. You can then build them using Visual
@@ -44,13 +56,19 @@
 
 #### Incorporating Into An Existing CMake Project
 
-If you want to use gtest in a project which already uses CMake, then a more
-robust and flexible approach is to build gtest as part of that project directly.
-This is done by making the GoogleTest source code available to the main build
-and adding it using CMake's `add_subdirectory()` command. This has the
-significant advantage that the same compiler and linker settings are used
-between gtest and the rest of your project, so issues associated with using
-incompatible libraries (eg debug/release), etc. are avoided. This is
+If you want to use GoogleTest in a project which already uses CMake, the easiest
+way is to use the installed libraries and headers.
+
+*   Import GoogleTest by using `find_package` (or `pkg_check_modules`). For
+    example, if `find_package(GTest CONFIG REQUIRED)` succeeds, you can use the
+    libraries as `GTest::gtest`, `GTest::gmock`.
+
+A more robust and flexible approach is to build GoogleTest as part of that
+project directly. This is done by making the GoogleTest source code available to
+the main build and adding it using CMake's `add_subdirectory()` command. This
+has the significant advantage that the same compiler and linker settings are
+used between GoogleTest and the rest of your project, so issues associated with
+using incompatible libraries (e.g. debug/release), etc. are avoided. This is
 particularly useful on Windows. Making GoogleTest's source code available to the
 main build can be done a few different ways:
 
@@ -64,68 +82,23 @@
     possible or appropriate. Git submodules, for example, have their own set of
     advantages and drawbacks.
 *   Use CMake to download GoogleTest as part of the build's configure step. This
-    is just a little more complex, but doesn't have the limitations of the other
-    methods.
+    approach doesn't have the limitations of the other methods.
 
-The last of the above methods is implemented with a small piece of CMake code in
-a separate file (e.g. `CMakeLists.txt.in`) which is copied to the build area and
-then invoked as a sub-build _during the CMake stage_. That directory is then
-pulled into the main build with `add_subdirectory()`. For example:
+The last of the above methods is implemented with a small piece of CMake code
+that downloads and pulls the GoogleTest code into the main build.
 
-New file `CMakeLists.txt.in`:
+Just add to your `CMakeLists.txt`:
 
 ```cmake
-cmake_minimum_required(VERSION 2.8.2)
-
-project(googletest-download NONE)
-
-include(ExternalProject)
-ExternalProject_Add(googletest
-  GIT_REPOSITORY    https://github.com/google/googletest.git
-  GIT_TAG           master
-  SOURCE_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-src"
-  BINARY_DIR        "${CMAKE_CURRENT_BINARY_DIR}/googletest-build"
-  CONFIGURE_COMMAND ""
-  BUILD_COMMAND     ""
-  INSTALL_COMMAND   ""
-  TEST_COMMAND      ""
+include(FetchContent)
+FetchContent_Declare(
+  googletest
+  # Specify the commit you depend on and update it regularly.
+  URL https://github.com/google/googletest/archive/609281088cfefc76f9d0ce82e1ff6c30cc3591e5.zip
 )
-```
-
-Existing build's `CMakeLists.txt`:
-
-```cmake
-# Download and unpack googletest at configure time
-configure_file(CMakeLists.txt.in googletest-download/CMakeLists.txt)
-execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" .
-  RESULT_VARIABLE result
-  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
-if(result)
-  message(FATAL_ERROR "CMake step for googletest failed: ${result}")
-endif()
-execute_process(COMMAND ${CMAKE_COMMAND} --build .
-  RESULT_VARIABLE result
-  WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/googletest-download )
-if(result)
-  message(FATAL_ERROR "Build step for googletest failed: ${result}")
-endif()
-
-# Prevent overriding the parent project's compiler/linker
-# settings on Windows
+# For Windows: Prevent overriding the parent project's compiler/linker settings
 set(gtest_force_shared_crt ON CACHE BOOL "" FORCE)
-
-# Add googletest directly to our build. This defines
-# the gtest and gtest_main targets.
-add_subdirectory(${CMAKE_CURRENT_BINARY_DIR}/googletest-src
-                 ${CMAKE_CURRENT_BINARY_DIR}/googletest-build
-                 EXCLUDE_FROM_ALL)
-
-# The gtest/gtest_main targets carry header search path
-# dependencies automatically when using CMake 2.8.11 or
-# later. Otherwise we have to add them here ourselves.
-if (CMAKE_VERSION VERSION_LESS 2.8.11)
-  include_directories("${gtest_SOURCE_DIR}/include")
-endif()
+FetchContent_MakeAvailable(googletest)
 
 # Now simply link against gtest or gtest_main as needed. Eg
 add_executable(example example.cpp)
@@ -133,20 +106,18 @@
 add_test(NAME example_test COMMAND example)
 ```
 
-Note that this approach requires CMake 2.8.2 or later due to its use of the
-`ExternalProject_Add()` command. The above technique is discussed in more detail
-in [this separate article](http://crascit.com/2015/07/25/cmake-gtest/) which
-also contains a link to a fully generalized implementation of the technique.
+Note that this approach requires CMake 3.14 or later due to its use of the
+`FetchContent_MakeAvailable()` command.
 
 ##### Visual Studio Dynamic vs Static Runtimes
 
 By default, new Visual Studio projects link the C runtimes dynamically but
-Google Test links them statically. This will generate an error that looks
+GoogleTest links them statically. This will generate an error that looks
 something like the following: gtest.lib(gtest-all.obj) : error LNK2038: mismatch
 detected for 'RuntimeLibrary': value 'MTd_StaticDebug' doesn't match value
 'MDd_DynamicDebug' in main.obj
 
-Google Test already has a CMake option for this: `gtest_force_shared_crt`
+GoogleTest already has a CMake option for this: `gtest_force_shared_crt`
 
 Enabling this option will make gtest link the runtimes dynamically too, and
 match the project in which it is included.
@@ -154,17 +125,17 @@
 #### C++ Standard Version
 
 An environment that supports C++11 is required in order to successfully build
-Google Test. One way to ensure this is to specify the standard in the top-level
+GoogleTest. One way to ensure this is to specify the standard in the top-level
 project, for example by using the `set(CMAKE_CXX_STANDARD 11)` command. If this
-is not feasible, for example in a C project using Google Test for validation,
+is not feasible, for example in a C project using GoogleTest for validation,
 then it can be specified by adding it to the options for cmake via the
 `DCMAKE_CXX_FLAGS` option.
 
-### Tweaking Google Test
+### Tweaking GoogleTest
 
-Google Test can be used in diverse environments. The default configuration may
+GoogleTest can be used in diverse environments. The default configuration may
 not work (or may not work well) out of the box in some environments. However,
-you can easily tweak Google Test by defining control macros on the compiler
+you can easily tweak GoogleTest by defining control macros on the compiler
 command line. Generally, these macros are named like `GTEST_XYZ` and you define
 them to either 1 or 0 to enable or disable a certain feature.
 
@@ -173,12 +144,12 @@
 
 ### Multi-threaded Tests
 
-Google Test is thread-safe where the pthread library is available. After
+GoogleTest is thread-safe where the pthread library is available. After
 `#include "gtest/gtest.h"`, you can check the
 `GTEST_IS_THREADSAFE` macro to see whether this is the case (yes if the macro is
 `#defined` to 1, no if it's undefined.).
 
-If Google Test doesn't correctly detect whether pthread is available in your
+If GoogleTest doesn't correctly detect whether pthread is available in your
 environment, you can force it with
 
     -DGTEST_HAS_PTHREAD=1
@@ -187,16 +158,16 @@
 
     -DGTEST_HAS_PTHREAD=0
 
-When Google Test uses pthread, you may need to add flags to your compiler and/or
+When GoogleTest uses pthread, you may need to add flags to your compiler and/or
 linker to select the pthread library, or you'll get link errors. If you use the
-CMake script or the deprecated Autotools script, this is taken care of for you.
-If you use your own build script, you'll need to read your compiler and linker's
-manual to figure out what flags to add.
+CMake script, this is taken care of for you. If you use your own build script,
+you'll need to read your compiler and linker's manual to figure out what flags
+to add.
 
 ### As a Shared Library (DLL)
 
-Google Test is compact, so most users can build and link it as a static library
-for the simplicity. You can choose to use Google Test as a shared library (known
+GoogleTest is compact, so most users can build and link it as a static library
+for simplicity. You can choose to use GoogleTest as a shared library (known
 as a DLL on Windows) if you prefer.
 
 To compile *gtest* as a shared library, add
@@ -216,22 +187,22 @@
 compilers (e.g. GCC), they may become necessary in the future, if we decide to
 improve the speed of loading the library (see
 <http://gcc.gnu.org/wiki/Visibility> for details). Therefore you are recommended
-to always add the above flags when using Google Test as a shared library.
-Otherwise a future release of Google Test may break your build script.
+to always add the above flags when using GoogleTest as a shared library.
+Otherwise a future release of GoogleTest may break your build script.
 
 ### Avoiding Macro Name Clashes
 
 In C++, macros don't obey namespaces. Therefore two libraries that both define a
 macro of the same name will clash if you `#include` both definitions. In case a
-Google Test macro clashes with another library, you can force Google Test to
+GoogleTest macro clashes with another library, you can force GoogleTest to
 rename its macro to avoid the conflict.
 
-Specifically, if both Google Test and some other code define macro FOO, you can
+Specifically, if both GoogleTest and some other code define macro FOO, you can
 add
 
     -DGTEST_DONT_DEFINE_FOO=1
 
-to the compiler flags to tell Google Test to change the macro's name from `FOO`
+to the compiler flags to tell GoogleTest to change the macro's name from `FOO`
 to `GTEST_FOO`. Currently `FOO` can be `FAIL`, `SUCCEED`, or `TEST`. For
 example, with `-DGTEST_DONT_DEFINE_TEST=1`, you'll need to write
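
The prefixed form referred to just above is `GTEST_TEST`, which GoogleTest always defines regardless of the `GTEST_DONT_DEFINE_*` settings. A minimal sketch:

```cpp
#include "gtest/gtest.h"

// With -DGTEST_DONT_DEFINE_TEST=1 the short TEST macro is left undefined,
// so tests are declared with the always-available prefixed form.
GTEST_TEST(MacroClashDemo, StillRuns) {
  EXPECT_EQ(2 + 2, 4);
}
```
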
 
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h b/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h
index dc878ff..9b4d4d1 100644
--- a/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest-death-test.h
@@ -35,8 +35,8 @@
 // directly.
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
-#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
 
 #include "gtest/internal/gtest-death-test-internal.h"
 
@@ -97,6 +97,10 @@
 //
 //   ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!");
 //
+// The final parameter to each of these macros is a matcher applied to any data
+// the sub-process wrote to stderr.  For compatibility with existing tests, a
+// bare string is interpreted as a regular expression matcher.
+//
 // On the regular expressions used in death tests:
 //
 //   GOOGLETEST_CM0005 DO NOT DELETE
@@ -162,27 +166,27 @@
 //   directory in PATH.
 //
 
-// Asserts that a given statement causes the program to exit, with an
-// integer exit status that satisfies predicate, and emitting error output
-// that matches regex.
-# define ASSERT_EXIT(statement, predicate, regex) \
-    GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_)
+// Asserts that a given `statement` causes the program to exit, with an
+// integer exit status that satisfies `predicate`, and emitting error output
+// that matches `matcher`.
+# define ASSERT_EXIT(statement, predicate, matcher) \
+    GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_FATAL_FAILURE_)
 
-// Like ASSERT_EXIT, but continues on to successive tests in the
+// Like `ASSERT_EXIT`, but continues on to successive tests in the
 // test suite, if any:
-# define EXPECT_EXIT(statement, predicate, regex) \
-    GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_)
+# define EXPECT_EXIT(statement, predicate, matcher) \
+    GTEST_DEATH_TEST_(statement, predicate, matcher, GTEST_NONFATAL_FAILURE_)
 
-// Asserts that a given statement causes the program to exit, either by
+// Asserts that a given `statement` causes the program to exit, either by
 // explicitly exiting with a nonzero exit code or being killed by a
-// signal, and emitting error output that matches regex.
-# define ASSERT_DEATH(statement, regex) \
-    ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
+// signal, and emitting error output that matches `matcher`.
+# define ASSERT_DEATH(statement, matcher) \
+    ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher)
 
-// Like ASSERT_DEATH, but continues on to successive tests in the
+// Like `ASSERT_DEATH`, but continues on to successive tests in the
 // test suite, if any:
-# define EXPECT_DEATH(statement, regex) \
-    EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex)
+# define EXPECT_DEATH(statement, matcher) \
+    EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, matcher)
 
 // Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*:
 
@@ -190,11 +194,10 @@
 class GTEST_API_ ExitedWithCode {
  public:
   explicit ExitedWithCode(int exit_code);
+  ExitedWithCode(const ExitedWithCode&) = default;
+  void operator=(const ExitedWithCode& other) = delete;
   bool operator()(int exit_status) const;
  private:
-  // No implementation - assignment is unsupported.
-  void operator=(const ExitedWithCode& other);
-
   const int exit_code_;
 };
 
@@ -340,4 +343,4 @@
 
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_
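
As the updated header comment notes, the final macro parameter is now a matcher applied to the child process's stderr, and a bare string keeps its old meaning as a regular expression. A sketch (death tests are platform-dependent, so this assumes `GTEST_HAS_DEATH_TEST`):

```cpp
#include <cstdio>
#include <cstdlib>
#include "gtest/gtest.h"

void CrashWithMessage() {
  std::fprintf(stderr, "fatal: index out of range\n");
  std::abort();
}

TEST(DeathMatcherDemo, BareStringIsStillARegex) {
  // Backward compatible: the string argument is treated as a regex matcher.
  EXPECT_DEATH(CrashWithMessage(), "out of range");
}

TEST(DeathMatcherDemo, ExplicitMatcher) {
  // Any matcher over the captured stderr now works; ContainsRegex is
  // provided by gtest-matchers.h itself.
  EXPECT_DEATH(CrashWithMessage(), testing::ContainsRegex("fatal: .*range"));
}
```
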
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h b/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h
index a61cef4..9fa34a0 100644
--- a/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest-matchers.h
@@ -32,13 +32,10 @@
 // This file implements just enough of the matcher interface to allow
 // EXPECT_DEATH and friends to accept a matcher argument.
 
-// IWYU pragma: private, include "testing/base/public/gunit.h"
-// IWYU pragma: friend third_party/googletest/googlemock/.*
-// IWYU pragma: friend third_party/googletest/googletest/.*
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
-#define GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
-
+#include <atomic>
 #include <memory>
 #include <ostream>
 #include <string>
@@ -63,20 +60,16 @@
 namespace testing {
 
 // To implement a matcher Foo for type T, define:
-//   1. a class FooMatcherImpl that implements the
-//      MatcherInterface<T> interface, and
+//   1. a class FooMatcherMatcher that implements the matcher interface:
+//     using is_gtest_matcher = void;
+//     bool MatchAndExplain(const T&, std::ostream*);
+//       (MatchResultListener* can also be used instead of std::ostream*)
+//     void DescribeTo(std::ostream*);
+//     void DescribeNegationTo(std::ostream*);
+//
 //   2. a factory function that creates a Matcher<T> object from a
-//      FooMatcherImpl*.
-//
-// The two-level delegation design makes it possible to allow a user
-// to write "v" instead of "Eq(v)" where a Matcher is expected, which
-// is impossible if we pass matchers by pointers.  It also eases
-// ownership management as Matcher objects can now be copied like
-// plain values.
+//      FooMatcherMatcher.
 
-// MatchResultListener is an abstract class.  Its << operator can be
-// used by a matcher to explain why a value matches or doesn't match.
-//
 class MatchResultListener {
  public:
   // Creates a listener object with the given underlying ostream.  The
@@ -113,7 +106,7 @@
 
 // An instance of a subclass of this knows how to describe itself as a
 // matcher.
-class MatcherDescriberInterface {
+class GTEST_API_ MatcherDescriberInterface {
  public:
   virtual ~MatcherDescriberInterface() {}
 
@@ -181,31 +174,6 @@
 
 namespace internal {
 
-// Converts a MatcherInterface<T> to a MatcherInterface<const T&>.
-template <typename T>
-class MatcherInterfaceAdapter : public MatcherInterface<const T&> {
- public:
-  explicit MatcherInterfaceAdapter(const MatcherInterface<T>* impl)
-      : impl_(impl) {}
-  ~MatcherInterfaceAdapter() override { delete impl_; }
-
-  void DescribeTo(::std::ostream* os) const override { impl_->DescribeTo(os); }
-
-  void DescribeNegationTo(::std::ostream* os) const override {
-    impl_->DescribeNegationTo(os);
-  }
-
-  bool MatchAndExplain(const T& x,
-                       MatchResultListener* listener) const override {
-    return impl_->MatchAndExplain(x, listener);
-  }
-
- private:
-  const MatcherInterface<T>* const impl_;
-
-  GTEST_DISALLOW_COPY_AND_ASSIGN_(MatcherInterfaceAdapter);
-};
-
 struct AnyEq {
   template <typename A, typename B>
   bool operator()(const A& a, const B& b) const { return a == b; }
@@ -252,16 +220,35 @@
   GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamMatchResultListener);
 };
 
+struct SharedPayloadBase {
+  std::atomic<int> ref{1};
+  void Ref() { ref.fetch_add(1, std::memory_order_relaxed); }
+  bool Unref() { return ref.fetch_sub(1, std::memory_order_acq_rel) == 1; }
+};
+
+template <typename T>
+struct SharedPayload : SharedPayloadBase {
+  explicit SharedPayload(const T& v) : value(v) {}
+  explicit SharedPayload(T&& v) : value(std::move(v)) {}
+
+  static void Destroy(SharedPayloadBase* shared) {
+    delete static_cast<SharedPayload*>(shared);
+  }
+
+  T value;
+};
+
 // An internal class for implementing Matcher<T>, which will derive
 // from it.  We put functionalities common to all Matcher<T>
 // specializations here to avoid code duplication.
 template <typename T>
-class MatcherBase {
+class MatcherBase : private MatcherDescriberInterface {
  public:
   // Returns true if and only if the matcher matches x; also explains the
   // match result to 'listener'.
   bool MatchAndExplain(const T& x, MatchResultListener* listener) const {
-    return impl_->MatchAndExplain(x, listener);
+    GTEST_CHECK_(vtable_ != nullptr);
+    return vtable_->match_and_explain(*this, x, listener);
   }
 
   // Returns true if and only if this matcher matches x.
@@ -271,11 +258,15 @@
   }
 
   // Describes this matcher to an ostream.
-  void DescribeTo(::std::ostream* os) const { impl_->DescribeTo(os); }
+  void DescribeTo(::std::ostream* os) const final {
+    GTEST_CHECK_(vtable_ != nullptr);
+    vtable_->describe(*this, os, false);
+  }
 
   // Describes the negation of this matcher to an ostream.
-  void DescribeNegationTo(::std::ostream* os) const {
-    impl_->DescribeNegationTo(os);
+  void DescribeNegationTo(::std::ostream* os) const final {
+    GTEST_CHECK_(vtable_ != nullptr);
+    vtable_->describe(*this, os, true);
   }
 
   // Explains why x matches, or doesn't match, the matcher.
@@ -288,31 +279,194 @@
   // of the describer, which is only guaranteed to be alive when
   // this matcher object is alive.
   const MatcherDescriberInterface* GetDescriber() const {
-    return impl_.get();
+    if (vtable_ == nullptr) return nullptr;
+    return vtable_->get_describer(*this);
   }
 
  protected:
-  MatcherBase() {}
+  MatcherBase() : vtable_(nullptr) {}
 
   // Constructs a matcher from its implementation.
-  explicit MatcherBase(const MatcherInterface<const T&>* impl) : impl_(impl) {}
-
   template <typename U>
-  explicit MatcherBase(
-      const MatcherInterface<U>* impl,
-      typename std::enable_if<!std::is_same<U, const U&>::value>::type* =
-          nullptr)
-      : impl_(new internal::MatcherInterfaceAdapter<U>(impl)) {}
+  explicit MatcherBase(const MatcherInterface<U>* impl) {
+    Init(impl);
+  }
 
-  MatcherBase(const MatcherBase&) = default;
-  MatcherBase& operator=(const MatcherBase&) = default;
-  MatcherBase(MatcherBase&&) = default;
-  MatcherBase& operator=(MatcherBase&&) = default;
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  MatcherBase(M&& m) {  // NOLINT
+    Init(std::forward<M>(m));
+  }
 
-  virtual ~MatcherBase() {}
+  MatcherBase(const MatcherBase& other)
+      : vtable_(other.vtable_), buffer_(other.buffer_) {
+    if (IsShared()) buffer_.shared->Ref();
+  }
+
+  MatcherBase& operator=(const MatcherBase& other) {
+    if (this == &other) return *this;
+    Destroy();
+    vtable_ = other.vtable_;
+    buffer_ = other.buffer_;
+    if (IsShared()) buffer_.shared->Ref();
+    return *this;
+  }
+
+  MatcherBase(MatcherBase&& other)
+      : vtable_(other.vtable_), buffer_(other.buffer_) {
+    other.vtable_ = nullptr;
+  }
+
+  MatcherBase& operator=(MatcherBase&& other) {
+    if (this == &other) return *this;
+    Destroy();
+    vtable_ = other.vtable_;
+    buffer_ = other.buffer_;
+    other.vtable_ = nullptr;
+    return *this;
+  }
+
+  ~MatcherBase() override { Destroy(); }
 
  private:
-  std::shared_ptr<const MatcherInterface<const T&>> impl_;
+  struct VTable {
+    bool (*match_and_explain)(const MatcherBase&, const T&,
+                              MatchResultListener*);
+    void (*describe)(const MatcherBase&, std::ostream*, bool negation);
+    // Returns the captured object if it implements the interface, otherwise
+    // returns the MatcherBase itself.
+    const MatcherDescriberInterface* (*get_describer)(const MatcherBase&);
+    // Called on shared instances when the reference count reaches 0.
+    void (*shared_destroy)(SharedPayloadBase*);
+  };
+
+  bool IsShared() const {
+    return vtable_ != nullptr && vtable_->shared_destroy != nullptr;
+  }
+
+  // If the implementation uses a listener, call that.
+  template <typename P>
+  static auto MatchAndExplainImpl(const MatcherBase& m, const T& value,
+                                  MatchResultListener* listener)
+      -> decltype(P::Get(m).MatchAndExplain(value, listener->stream())) {
+    return P::Get(m).MatchAndExplain(value, listener->stream());
+  }
+
+  template <typename P>
+  static auto MatchAndExplainImpl(const MatcherBase& m, const T& value,
+                                  MatchResultListener* listener)
+      -> decltype(P::Get(m).MatchAndExplain(value, listener)) {
+    return P::Get(m).MatchAndExplain(value, listener);
+  }
+
+  template <typename P>
+  static void DescribeImpl(const MatcherBase& m, std::ostream* os,
+                           bool negation) {
+    if (negation) {
+      P::Get(m).DescribeNegationTo(os);
+    } else {
+      P::Get(m).DescribeTo(os);
+    }
+  }
+
+  template <typename P>
+  static const MatcherDescriberInterface* GetDescriberImpl(
+      const MatcherBase& m) {
+    // If the impl is a MatcherDescriberInterface, then return it.
+    // Otherwise use MatcherBase itself.
+    // This allows us to implement the GetDescriber() function without support
+    // from the impl, but some users really want to get their impl back when
+    // they call GetDescriber().
+    // We use std::get on a tuple as a workaround for not having `if constexpr`.
+    return std::get<(
+        std::is_convertible<decltype(&P::Get(m)),
+                            const MatcherDescriberInterface*>::value
+            ? 1
+            : 0)>(std::make_tuple(&m, &P::Get(m)));
+  }
+
+  template <typename P>
+  const VTable* GetVTable() {
+    static constexpr VTable kVTable = {&MatchAndExplainImpl<P>,
+                                       &DescribeImpl<P>, &GetDescriberImpl<P>,
+                                       P::shared_destroy};
+    return &kVTable;
+  }
+
+  union Buffer {
+    // Add some types to give Buffer some common alignment/size use cases.
+    void* ptr;
+    double d;
+    int64_t i;
+    // And add one for the out-of-line cases.
+    SharedPayloadBase* shared;
+  };
+
+  void Destroy() {
+    if (IsShared() && buffer_.shared->Unref()) {
+      vtable_->shared_destroy(buffer_.shared);
+    }
+  }
+
+  template <typename M>
+  static constexpr bool IsInlined() {
+    return sizeof(M) <= sizeof(Buffer) && alignof(M) <= alignof(Buffer) &&
+           std::is_trivially_copy_constructible<M>::value &&
+           std::is_trivially_destructible<M>::value;
+  }
+
+  template <typename M, bool = MatcherBase::IsInlined<M>()>
+  struct ValuePolicy {
+    static const M& Get(const MatcherBase& m) {
+      // When inlined along with Init, need to be explicit to avoid violating
+      // strict aliasing rules.
+      const M *ptr = static_cast<const M*>(
+          static_cast<const void*>(&m.buffer_));
+      return *ptr;
+    }
+    static void Init(MatcherBase& m, M impl) {
+      ::new (static_cast<void*>(&m.buffer_)) M(impl);
+    }
+    static constexpr auto shared_destroy = nullptr;
+  };
+
+  template <typename M>
+  struct ValuePolicy<M, false> {
+    using Shared = SharedPayload<M>;
+    static const M& Get(const MatcherBase& m) {
+      return static_cast<Shared*>(m.buffer_.shared)->value;
+    }
+    template <typename Arg>
+    static void Init(MatcherBase& m, Arg&& arg) {
+      m.buffer_.shared = new Shared(std::forward<Arg>(arg));
+    }
+    static constexpr auto shared_destroy = &Shared::Destroy;
+  };
+
+  template <typename U, bool B>
+  struct ValuePolicy<const MatcherInterface<U>*, B> {
+    using M = const MatcherInterface<U>;
+    using Shared = SharedPayload<std::unique_ptr<M>>;
+    static const M& Get(const MatcherBase& m) {
+      return *static_cast<Shared*>(m.buffer_.shared)->value;
+    }
+    static void Init(MatcherBase& m, M* impl) {
+      m.buffer_.shared = new Shared(std::unique_ptr<M>(impl));
+    }
+
+    static constexpr auto shared_destroy = &Shared::Destroy;
+  };
+
+  template <typename M>
+  void Init(M&& m) {
+    using MM = typename std::decay<M>::type;
+    using Policy = ValuePolicy<MM>;
+    vtable_ = GetVTable<Policy>();
+    Policy::Init(*this, std::forward<M>(m));
+  }
+
+  const VTable* vtable_;
+  Buffer buffer_;
 };
 
 }  // namespace internal
@@ -340,6 +494,10 @@
           nullptr)
       : internal::MatcherBase<T>(impl) {}
 
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  Matcher(M&& m) : internal::MatcherBase<T>(std::forward<M>(m)) {}  // NOLINT
+
   // Implicit constructor here allows people to write
   // EXPECT_CALL(foo, Bar(5)) instead of EXPECT_CALL(foo, Bar(Eq(5))) sometimes
   Matcher(T value);  // NOLINT
@@ -357,6 +515,11 @@
   explicit Matcher(const MatcherInterface<const std::string&>* impl)
       : internal::MatcherBase<const std::string&>(impl) {}
 
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  Matcher(M&& m)  // NOLINT
+      : internal::MatcherBase<const std::string&>(std::forward<M>(m)) {}
+
   // Allows the user to write str instead of Eq(str) sometimes, where
   // str is a std::string object.
   Matcher(const std::string& s);  // NOLINT
@@ -376,6 +539,11 @@
   explicit Matcher(const MatcherInterface<std::string>* impl)
       : internal::MatcherBase<std::string>(impl) {}
 
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  Matcher(M&& m)  // NOLINT
+      : internal::MatcherBase<std::string>(std::forward<M>(m)) {}
+
   // Allows the user to write str instead of Eq(str) sometimes, where
   // str is a string object.
   Matcher(const std::string& s);  // NOLINT
@@ -397,6 +565,12 @@
   explicit Matcher(const MatcherInterface<const internal::StringView&>* impl)
       : internal::MatcherBase<const internal::StringView&>(impl) {}
 
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  Matcher(M&& m)  // NOLINT
+      : internal::MatcherBase<const internal::StringView&>(std::forward<M>(m)) {
+  }
+
   // Allows the user to write str instead of Eq(str) sometimes, where
   // str is a std::string object.
   Matcher(const std::string& s);  // NOLINT
@@ -419,6 +593,11 @@
   explicit Matcher(const MatcherInterface<internal::StringView>* impl)
       : internal::MatcherBase<internal::StringView>(impl) {}
 
+  template <typename M, typename = typename std::remove_reference<
+                            M>::type::is_gtest_matcher>
+  Matcher(M&& m)  // NOLINT
+      : internal::MatcherBase<internal::StringView>(std::forward<M>(m)) {}
+
   // Allows the user to write str instead of Eq(str) sometimes, where
   // str is a std::string object.
   Matcher(const std::string& s);  // NOLINT
@@ -529,37 +708,32 @@
 class ComparisonBase {
  public:
   explicit ComparisonBase(const Rhs& rhs) : rhs_(rhs) {}
+
+  using is_gtest_matcher = void;
+
   template <typename Lhs>
-  operator Matcher<Lhs>() const {
-    return Matcher<Lhs>(new Impl<const Lhs&>(rhs_));
+  bool MatchAndExplain(const Lhs& lhs, std::ostream*) const {
+    return Op()(lhs, Unwrap(rhs_));
+  }
+  void DescribeTo(std::ostream* os) const {
+    *os << D::Desc() << " ";
+    UniversalPrint(Unwrap(rhs_), os);
+  }
+  void DescribeNegationTo(std::ostream* os) const {
+    *os << D::NegatedDesc() << " ";
+    UniversalPrint(Unwrap(rhs_), os);
   }
 
  private:
   template <typename T>
-  static const T& Unwrap(const T& v) { return v; }
+  static const T& Unwrap(const T& v) {
+    return v;
+  }
   template <typename T>
-  static const T& Unwrap(std::reference_wrapper<T> v) { return v; }
+  static const T& Unwrap(std::reference_wrapper<T> v) {
+    return v;
+  }
 
-  template <typename Lhs, typename = Rhs>
-  class Impl : public MatcherInterface<Lhs> {
-   public:
-    explicit Impl(const Rhs& rhs) : rhs_(rhs) {}
-    bool MatchAndExplain(Lhs lhs,
-                         MatchResultListener* /* listener */) const override {
-      return Op()(lhs, Unwrap(rhs_));
-    }
-    void DescribeTo(::std::ostream* os) const override {
-      *os << D::Desc() << " ";
-      UniversalPrint(Unwrap(rhs_), os);
-    }
-    void DescribeNegationTo(::std::ostream* os) const override {
-      *os << D::NegatedDesc() <<  " ";
-      UniversalPrint(Unwrap(rhs_), os);
-    }
-
-   private:
-    Rhs rhs_;
-  };
   Rhs rhs_;
 };
 
@@ -612,6 +786,10 @@
   static const char* NegatedDesc() { return "isn't >="; }
 };
 
+template <typename T, typename = typename std::enable_if<
+                          std::is_constructible<std::string, T>::value>::type>
+using StringLike = T;
+
 // Implements polymorphic matchers MatchesRegex(regex) and
 // ContainsRegex(regex), which can be used as a Matcher<T> as long as
 // T can be converted to a string.
@@ -672,9 +850,10 @@
     const internal::RE* regex) {
   return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, true));
 }
-inline PolymorphicMatcher<internal::MatchesRegexMatcher> MatchesRegex(
-    const std::string& regex) {
-  return MatchesRegex(new internal::RE(regex));
+template <typename T = std::string>
+PolymorphicMatcher<internal::MatchesRegexMatcher> MatchesRegex(
+    const internal::StringLike<T>& regex) {
+  return MatchesRegex(new internal::RE(std::string(regex)));
 }
 
 // Matches a string that contains regular expression 'regex'.
@@ -683,9 +862,10 @@
     const internal::RE* regex) {
   return MakePolymorphicMatcher(internal::MatchesRegexMatcher(regex, false));
 }
-inline PolymorphicMatcher<internal::MatchesRegexMatcher> ContainsRegex(
-    const std::string& regex) {
-  return ContainsRegex(new internal::RE(regex));
+template <typename T = std::string>
+PolymorphicMatcher<internal::MatchesRegexMatcher> ContainsRegex(
+    const internal::StringLike<T>& regex) {
+  return ContainsRegex(new internal::RE(std::string(regex)));
 }
 
 // Creates a polymorphic matcher that matches anything equal to x.
@@ -747,4 +927,4 @@
 
 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251 5046
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_MATCHERS_H_
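
The rewrite above drops the `shared_ptr<MatcherInterface>` pimpl in favor of a hand-rolled vtable plus an inline buffer: small trivially-copyable matchers are stored in place, larger ones go through the ref-counted `SharedPayload`. The user-visible consequence is the `is_gtest_matcher` protocol described in the updated comment at the top of the hunk; any class exposing that typedef plus the three member functions converts to a `Matcher<T>` directly. A sketch:

```cpp
#include <ostream>
#include "gtest/gtest.h"

class IsEvenMatcher {
 public:
  using is_gtest_matcher = void;

  bool MatchAndExplain(int n, std::ostream *os) const {
    // The listener stream may be null when no explanation is wanted.
    if (os != nullptr) *os << "the remainder is " << (n % 2);
    return n % 2 == 0;
  }
  void DescribeTo(std::ostream *os) const { *os << "is even"; }
  void DescribeNegationTo(std::ostream *os) const { *os << "is odd"; }
};

TEST(IsGtestMatcherDemo, ConvertsDirectlyToMatcher) {
  // An empty, trivially copyable matcher fits MatcherBase's inline Buffer,
  // so this conversion performs no heap allocation.
  testing::Matcher<int> is_even = IsEvenMatcher();
  EXPECT_TRUE(is_even.Matches(4));
  EXPECT_FALSE(is_even.Matches(5));
}
```
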
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-message.h b/libvpx/third_party/googletest/src/include/gtest/gtest-message.h
index 2189923..becfd49 100644
--- a/libvpx/third_party/googletest/src/include/gtest/gtest-message.h
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest-message.h
@@ -44,8 +44,8 @@
 
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
-#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
 
 #include <limits>
 #include <memory>
@@ -216,4 +216,4 @@
 
 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_MESSAGE_H_
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h b/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h
index 5b039df..804e702 100644
--- a/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest-param-test.h
@@ -30,12 +30,9 @@
 // Macros and functions for implementing parameterized tests
 // in Google C++ Testing and Mocking Framework (Google Test)
 //
-// This file is generated by a SCRIPT.  DO NOT EDIT BY HAND!
-//
 // GOOGLETEST_CM0001 DO NOT DELETE
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
-
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
 
 // Value-parameterized tests allow you to test your code with different
 // parameters without writing multiple copies of the same test.
@@ -371,8 +368,6 @@
 //     std::tuple<T1, T2, ..., TN> where T1, T2, ..., TN are the types
 //     of elements from sequences produces by gen1, gen2, ..., genN.
 //
-// Combine can have up to 10 arguments.
-//
 // Example:
 //
 // This will instantiate tests in test suite AnimalTest each one with
@@ -428,7 +423,8 @@
           ->AddTestPattern(                                                    \
               GTEST_STRINGIFY_(test_suite_name), GTEST_STRINGIFY_(test_name),  \
               new ::testing::internal::TestMetaFactory<GTEST_TEST_CLASS_NAME_( \
-                  test_suite_name, test_name)>());                             \
+                  test_suite_name, test_name)>(),                              \
+              ::testing::internal::CodeLocation(__FILE__, __LINE__));          \
       return 0;                                                                \
     }                                                                          \
     static int gtest_registering_dummy_ GTEST_ATTRIBUTE_UNUSED_;               \
@@ -508,4 +504,4 @@
 
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
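
The removed "up to 10 arguments" note reflects that `Combine()` is now variadic; its result, as the surrounding comment says, is a generator of `std::tuple` parameters. A sketch of the pattern:

```cpp
#include <string>
#include <tuple>
#include "gtest/gtest.h"

class ComboDemoTest
    : public ::testing::TestWithParam<std::tuple<int, std::string>> {};

TEST_P(ComboDemoTest, EachCombinationRuns) {
  // Runs once per element of the cross product: (1,"a"), (1,"b"), (2,"a"), ...
  EXPECT_GT(std::get<0>(GetParam()), 0);
  EXPECT_FALSE(std::get<1>(GetParam()).empty());
}

INSTANTIATE_TEST_SUITE_P(
    CrossProduct, ComboDemoTest,
    ::testing::Combine(::testing::Values(1, 2),
                       ::testing::Values(std::string("a"),
                                         std::string("b"))));
```
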
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h b/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h
index 407d1f1..076c9de 100644
--- a/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest-printers.h
@@ -97,10 +97,11 @@
 
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
 
 #include <functional>
+#include <memory>
 #include <ostream>  // NOLINT
 #include <sstream>
 #include <string>
@@ -108,64 +109,124 @@
 #include <type_traits>
 #include <utility>
 #include <vector>
+
 #include "gtest/internal/gtest-internal.h"
 #include "gtest/internal/gtest-port.h"
 
-#if GTEST_HAS_ABSL
-#include "absl/strings/string_view.h"
-#include "absl/types/optional.h"
-#include "absl/types/variant.h"
-#endif  // GTEST_HAS_ABSL
-
 namespace testing {
 
-// Definitions in the 'internal' and 'internal2' name spaces are
-// subject to change without notice.  DO NOT USE THEM IN USER CODE!
-namespace internal2 {
+// Definitions in the internal* namespaces are subject to change without notice.
+// DO NOT USE THEM IN USER CODE!
+namespace internal {
 
-// Prints the given number of bytes in the given object to the given
-// ostream.
-GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes,
-                                     size_t count,
-                                     ::std::ostream* os);
+template <typename T>
+void UniversalPrint(const T& value, ::std::ostream* os);
 
-// For selecting which printer to use when a given type has neither <<
-// nor PrintTo().
-enum TypeKind {
-  kProtobuf,              // a protobuf type
-  kConvertibleToInteger,  // a type implicitly convertible to BiggestInt
-                          // (e.g. a named or unnamed enum type)
-#if GTEST_INTERNAL_HAS_STRING_VIEW
-  kConvertibleToStringView,  // a type implicitly convertible to
-                             // absl::string_view or std::string_view
-#endif
-  kOtherType  // anything else
-};
+// Used to print an STL-style container when the user doesn't define
+// a PrintTo() for it.
+struct ContainerPrinter {
+  template <typename T,
+            typename = typename std::enable_if<
+                (sizeof(IsContainerTest<T>(0)) == sizeof(IsContainer)) &&
+                !IsRecursiveContainer<T>::value>::type>
+  static void PrintValue(const T& container, std::ostream* os) {
+    const size_t kMaxCount = 32;  // The maximum number of elements to print.
+    *os << '{';
+    size_t count = 0;
+    for (auto&& elem : container) {
+      if (count > 0) {
+        *os << ',';
+        if (count == kMaxCount) {  // Enough has been printed.
+          *os << " ...";
+          break;
+        }
+      }
+      *os << ' ';
+      // We cannot call PrintTo(elem, os) here as PrintTo() doesn't
+      // handle `elem` being a native array.
+      internal::UniversalPrint(elem, os);
+      ++count;
+    }
 
-// TypeWithoutFormatter<T, kTypeKind>::PrintValue(value, os) is called
-// by the universal printer to print a value of type T when neither
-// operator<< nor PrintTo() is defined for T, where kTypeKind is the
-// "kind" of T as defined by enum TypeKind.
-template <typename T, TypeKind kTypeKind>
-class TypeWithoutFormatter {
- public:
-  // This default version is called when kTypeKind is kOtherType.
-  static void PrintValue(const T& value, ::std::ostream* os) {
-    PrintBytesInObjectTo(
-        static_cast<const unsigned char*>(
-            reinterpret_cast<const void*>(std::addressof(value))),
-        sizeof(value), os);
+    if (count > 0) {
+      *os << ' ';
+    }
+    *os << '}';
   }
 };
 
-// We print a protobuf using its ShortDebugString() when the string
-// doesn't exceed this many characters; otherwise we print it using
-// DebugString() for better readability.
-const size_t kProtobufOneLinerMaxLength = 50;
+// Used to print a pointer that is neither a char pointer nor a member
+// pointer, when the user doesn't define PrintTo() for it.  (A member
+// variable pointer or member function pointer doesn't really point to
+// a location in the address space.  Their representation is
+// implementation-defined.  Therefore they will be printed as raw
+// bytes.)
+struct FunctionPointerPrinter {
+  template <typename T, typename = typename std::enable_if<
+                            std::is_function<T>::value>::type>
+  static void PrintValue(T* p, ::std::ostream* os) {
+    if (p == nullptr) {
+      *os << "NULL";
+    } else {
+      // T is a function type, so '*os << p' doesn't do what we want
+      // (it just prints p as bool).  We want to print p as a const
+      // void*.
+      *os << reinterpret_cast<const void*>(p);
+    }
+  }
+};
 
-template <typename T>
-class TypeWithoutFormatter<T, kProtobuf> {
- public:
+struct PointerPrinter {
+  template <typename T>
+  static void PrintValue(T* p, ::std::ostream* os) {
+    if (p == nullptr) {
+      *os << "NULL";
+    } else {
+      // T is not a function type.  We just call << to print p,
+      // relying on ADL to pick up user-defined << for their pointer
+      // types, if any.
+      *os << p;
+    }
+  }
+};
+
+namespace internal_stream_operator_without_lexical_name_lookup {
+
+// The presence of an operator<< here will terminate lexical scope lookup
+// straight away (even though it cannot be a match because of its argument
+// types). Thus, the two operator<< calls in StreamPrinter will find only ADL
+// candidates.
+struct LookupBlocker {};
+void operator<<(LookupBlocker, LookupBlocker);
+
+struct StreamPrinter {
+  template <typename T,
+            // Don't accept member pointers here. We'd print them via implicit
+            // conversion to bool, which isn't useful.
+            typename = typename std::enable_if<
+                !std::is_member_pointer<T>::value>::type,
+            // Only accept types for which we can find a streaming operator via
+            // ADL (possibly involving implicit conversions).
+            typename = decltype(std::declval<std::ostream&>()
+                                << std::declval<const T&>())>
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    // Call streaming operator found by ADL, possibly with implicit conversions
+    // of the arguments.
+    *os << value;
+  }
+};
+
+}  // namespace internal_stream_operator_without_lexical_name_lookup
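Editor's note: the LookupBlocker device above is subtle enough to merit a standalone sketch. Declaring any operator<< in the enclosing namespace stops unqualified name lookup from walking outward, so the stream expression inside StreamPrinter can only match member operators and ADL candidates. Names below are invented, not gtest's:

    #include <iostream>
    #include <ostream>

    namespace blocker_demo {
    // Any operator<< here terminates unqualified lookup at this scope, so the
    // call in Print() succeeds only via members or argument-dependent lookup.
    struct LookupBlocker {};
    void operator<<(LookupBlocker, LookupBlocker);

    template <typename T>
    void Print(const T& value, std::ostream* os) {
      *os << value;  // member operator<< or an ADL candidate only
    }
    }  // namespace blocker_demo

    namespace other {
    struct Point { int x; };
    std::ostream& operator<<(std::ostream& os, Point p) {
      return os << "P(" << p.x << ")";
    }
    }  // namespace other

    int main() {
      blocker_demo::Print(42, &std::cout);               // member operator<<
      blocker_demo::Print(other::Point{3}, &std::cout);  // found via ADL
      std::cout << '\n';
    }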
+
+struct ProtobufPrinter {
+  // We print a protobuf using its ShortDebugString() when the string
+  // doesn't exceed this many characters; otherwise we print it using
+  // DebugString() for better readability.
+  static const size_t kProtobufOneLinerMaxLength = 50;
+
+  template <typename T,
+            typename = typename std::enable_if<
+                internal::HasDebugStringAndShortDebugString<T>::value>::type>
   static void PrintValue(const T& value, ::std::ostream* os) {
     std::string pretty_str = value.ShortDebugString();
     if (pretty_str.length() > kProtobufOneLinerMaxLength) {
@@ -175,9 +236,7 @@
   }
 };
 
-template <typename T>
-class TypeWithoutFormatter<T, kConvertibleToInteger> {
- public:
+struct ConvertibleToIntegerPrinter {
   // Since T has no << operator or PrintTo() but can be implicitly
   // converted to BiggestInt, we print it as a BiggestInt.
   //
@@ -185,112 +244,74 @@
   // case printing it as an integer is the desired behavior.  In case
   // T is not an enum, printing it as an integer is the best we can do
   // given that it has no user-defined printer.
-  static void PrintValue(const T& value, ::std::ostream* os) {
-    const internal::BiggestInt kBigInt = value;
-    *os << kBigInt;
+  static void PrintValue(internal::BiggestInt value, ::std::ostream* os) {
+    *os << value;
   }
 };
 
+struct ConvertibleToStringViewPrinter {
 #if GTEST_INTERNAL_HAS_STRING_VIEW
-template <typename T>
-class TypeWithoutFormatter<T, kConvertibleToStringView> {
- public:
-  // Since T has neither operator<< nor PrintTo() but can be implicitly
-  // converted to absl::string_view, we print it as a absl::string_view
-  // (or std::string_view).
-  //
-  // Note: the implementation is further below, as it depends on
-  // internal::PrintTo symbol which is defined later in the file.
-  static void PrintValue(const T& value, ::std::ostream* os);
+  static void PrintValue(internal::StringView value, ::std::ostream* os) {
+    internal::UniversalPrint(value, os);
+  }
+#endif
 };
-#endif
 
-// Prints the given value to the given ostream.  If the value is a
-// protocol message, its debug string is printed; if it's an enum or
-// of a type implicitly convertible to BiggestInt, it's printed as an
-// integer; otherwise the bytes in the value are printed.  This is
-// what UniversalPrinter<T>::Print() does when it knows nothing about
-// type T and T has neither << operator nor PrintTo().
-//
-// A user can override this behavior for a class type Foo by defining
-// a << operator in the namespace where Foo is defined.
-//
-// We put this operator in namespace 'internal2' instead of 'internal'
-// to simplify the implementation, as much code in 'internal' needs to
-// use << in STL, which would conflict with our own << were it defined
-// in 'internal'.
-//
-// Note that this operator<< takes a generic std::basic_ostream<Char,
-// CharTraits> type instead of the more restricted std::ostream.  If
-// we define it to take an std::ostream instead, we'll get an
-// "ambiguous overloads" compiler error when trying to print a type
-// Foo that supports streaming to std::basic_ostream<Char,
-// CharTraits>, as the compiler cannot tell whether
-// operator<<(std::ostream&, const T&) or
-// operator<<(std::basic_stream<Char, CharTraits>, const Foo&) is more
-// specific.
-template <typename Char, typename CharTraits, typename T>
-::std::basic_ostream<Char, CharTraits>& operator<<(
-    ::std::basic_ostream<Char, CharTraits>& os, const T& x) {
-  TypeWithoutFormatter<T, (internal::IsAProtocolMessage<T>::value
-                               ? kProtobuf
-                               : std::is_convertible<
-                                     const T&, internal::BiggestInt>::value
-                                     ? kConvertibleToInteger
-                                     :
-#if GTEST_INTERNAL_HAS_STRING_VIEW
-                                     std::is_convertible<
-                                         const T&, internal::StringView>::value
-                                         ? kConvertibleToStringView
-                                         :
-#endif
-                                         kOtherType)>::PrintValue(x, &os);
-  return os;
-}
 
-}  // namespace internal2
-}  // namespace testing
+// Prints the given number of bytes in the given object to the given
+// ostream.
+GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes,
+                                     size_t count,
+                                     ::std::ostream* os);
+struct RawBytesPrinter {
+  // SFINAE on `sizeof` to make sure we have a complete type.
+  template <typename T, size_t = sizeof(T)>
+  static void PrintValue(const T& value, ::std::ostream* os) {
+    PrintBytesInObjectTo(
+        static_cast<const unsigned char*>(
+            // Load bearing cast to void* to support iOS
+            reinterpret_cast<const void*>(std::addressof(value))),
+        sizeof(value), os);
+  }
+};
 
-// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up
-// magic needed for implementing UniversalPrinter won't work.
-namespace testing_internal {
+struct FallbackPrinter {
+  template <typename T>
+  static void PrintValue(const T&, ::std::ostream* os) {
+    *os << "(incomplete type)";
+  }
+};
 
-// Used to print a value that is not an STL-style container when the
-// user doesn't define PrintTo() for it.
+// Try every printer in order and return the first one that works.
+template <typename T, typename E, typename Printer, typename... Printers>
+struct FindFirstPrinter : FindFirstPrinter<T, E, Printers...> {};
+
+template <typename T, typename Printer, typename... Printers>
+struct FindFirstPrinter<
+    T, decltype(Printer::PrintValue(std::declval<const T&>(), nullptr)),
+    Printer, Printers...> {
+  using type = Printer;
+};
+
+// Select the best printer in the following order:
+//  - Print containers (they have begin/end/etc).
+//  - Print function pointers.
+//  - Print object pointers.
+//  - Use the stream operator, if available.
+//  - Print protocol buffers.
+//  - Print types convertible to BiggestInt.
+//  - Print types convertible to StringView, if available.
+//  - Fallback to printing the raw bytes of the object.
 template <typename T>
-void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) {
-  // With the following statement, during unqualified name lookup,
-  // testing::internal2::operator<< appears as if it was declared in
-  // the nearest enclosing namespace that contains both
-  // ::testing_internal and ::testing::internal2, i.e. the global
-  // namespace.  For more details, refer to the C++ Standard section
-  // 7.3.4-1 [namespace.udir].  This allows us to fall back onto
-  // testing::internal2::operator<< in case T doesn't come with a <<
-  // operator.
-
-  using ::testing::internal2::operator<<;
-
-  // Assuming T is defined in namespace foo, in the next statement,
-  // the compiler will consider all of:
-  //
-  //   1. foo::operator<< (thanks to Koenig look-up),
-  //   2. ::operator<< (as the current namespace is enclosed in ::),
-  //   3. testing::internal2::operator<< (thanks to the using statement above).
-  //
-  // The operator<< whose type matches T best will be picked.
-  //
-  // We deliberately allow #2 to be a candidate, as sometimes it's
-  // impossible to define #1 (e.g. when foo is ::std, defining
-  // anything in it is undefined behavior unless you are a compiler
-  // vendor.).
-  *os << value;
+void PrintWithFallback(const T& value, ::std::ostream* os) {
+  using Printer = typename FindFirstPrinter<
+      T, void, ContainerPrinter, FunctionPointerPrinter, PointerPrinter,
+      internal_stream_operator_without_lexical_name_lookup::StreamPrinter,
+      ProtobufPrinter, ConvertibleToIntegerPrinter,
+      ConvertibleToStringViewPrinter, RawBytesPrinter, FallbackPrinter>::type;
+  Printer::PrintValue(value, os);
 }
 
-}  // namespace testing_internal
-
-namespace testing {
-namespace internal {
-
 // FormatForComparison<ToPrint, OtherOperand>::Format(value) formats a
 // value of type ToPrint that is an operand of a comparison assertion
 // (e.g. ASSERT_EQ).  OtherOperand is the type of the other operand in
@@ -339,6 +360,14 @@
 GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char);
 GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(wchar_t);
 GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const wchar_t);
+#ifdef __cpp_char8_t
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char8_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char8_t);
+#endif
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char16_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char16_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(char32_t);
+GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_(const char32_t);
 
 #undef GTEST_IMPL_FORMAT_C_STRING_AS_POINTER_
 
@@ -356,6 +385,14 @@
 
 GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char, ::std::string);
 GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char, ::std::string);
+#ifdef __cpp_char8_t
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char8_t, ::std::u8string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char8_t, ::std::u8string);
+#endif
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char16_t, ::std::u16string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char16_t, ::std::u16string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(char32_t, ::std::u32string);
+GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(const char32_t, ::std::u32string);
 
 #if GTEST_HAS_STD_WSTRING
 GTEST_IMPL_FORMAT_C_STRING_AS_STRING_(wchar_t, ::std::wstring);
@@ -388,85 +425,6 @@
 template <typename T>
 class UniversalPrinter;
 
-template <typename T>
-void UniversalPrint(const T& value, ::std::ostream* os);
-
-enum DefaultPrinterType {
-  kPrintContainer,
-  kPrintPointer,
-  kPrintFunctionPointer,
-  kPrintOther,
-};
-template <DefaultPrinterType type> struct WrapPrinterType {};
-
-// Used to print an STL-style container when the user doesn't define
-// a PrintTo() for it.
-template <typename C>
-void DefaultPrintTo(WrapPrinterType<kPrintContainer> /* dummy */,
-                    const C& container, ::std::ostream* os) {
-  const size_t kMaxCount = 32;  // The maximum number of elements to print.
-  *os << '{';
-  size_t count = 0;
-  for (typename C::const_iterator it = container.begin();
-       it != container.end(); ++it, ++count) {
-    if (count > 0) {
-      *os << ',';
-      if (count == kMaxCount) {  // Enough has been printed.
-        *os << " ...";
-        break;
-      }
-    }
-    *os << ' ';
-    // We cannot call PrintTo(*it, os) here as PrintTo() doesn't
-    // handle *it being a native array.
-    internal::UniversalPrint(*it, os);
-  }
-
-  if (count > 0) {
-    *os << ' ';
-  }
-  *os << '}';
-}
-
-// Used to print a pointer that is neither a char pointer nor a member
-// pointer, when the user doesn't define PrintTo() for it.  (A member
-// variable pointer or member function pointer doesn't really point to
-// a location in the address space.  Their representation is
-// implementation-defined.  Therefore they will be printed as raw
-// bytes.)
-template <typename T>
-void DefaultPrintTo(WrapPrinterType<kPrintPointer> /* dummy */,
-                    T* p, ::std::ostream* os) {
-  if (p == nullptr) {
-    *os << "NULL";
-  } else {
-    // T is not a function type.  We just call << to print p,
-    // relying on ADL to pick up user-defined << for their pointer
-    // types, if any.
-    *os << p;
-  }
-}
-template <typename T>
-void DefaultPrintTo(WrapPrinterType<kPrintFunctionPointer> /* dummy */,
-                    T* p, ::std::ostream* os) {
-  if (p == nullptr) {
-    *os << "NULL";
-  } else {
-    // T is a function type, so '*os << p' doesn't do what we want
-    // (it just prints p as bool).  We want to print p as a const
-    // void*.
-    *os << reinterpret_cast<const void*>(p);
-  }
-}
-
-// Used to print a non-container, non-pointer value when the user
-// doesn't define PrintTo() for it.
-template <typename T>
-void DefaultPrintTo(WrapPrinterType<kPrintOther> /* dummy */,
-                    const T& value, ::std::ostream* os) {
-  ::testing_internal::DefaultPrintNonContainerTo(value, os);
-}
-
 // Prints the given value using the << operator if it has one;
 // otherwise prints the bytes in it.  This is what
 // UniversalPrinter<T>::Print() does when PrintTo() is not specialized
@@ -480,36 +438,7 @@
 // wants).
 template <typename T>
 void PrintTo(const T& value, ::std::ostream* os) {
-  // DefaultPrintTo() is overloaded.  The type of its first argument
-  // determines which version will be picked.
-  //
-  // Note that we check for container types here, prior to we check
-  // for protocol message types in our operator<<.  The rationale is:
-  //
-  // For protocol messages, we want to give people a chance to
-  // override Google Mock's format by defining a PrintTo() or
-  // operator<<.  For STL containers, other formats can be
-  // incompatible with Google Mock's format for the container
-  // elements; therefore we check for container types here to ensure
-  // that our format is used.
-  //
-  // Note that MSVC and clang-cl do allow an implicit conversion from
-  // pointer-to-function to pointer-to-object, but clang-cl warns on it.
-  // So don't use ImplicitlyConvertible if it can be helped since it will
-  // cause this warning, and use a separate overload of DefaultPrintTo for
-  // function pointers so that the `*os << p` in the object pointer overload
-  // doesn't cause that warning either.
-  DefaultPrintTo(
-      WrapPrinterType <
-                  (sizeof(IsContainerTest<T>(0)) == sizeof(IsContainer)) &&
-              !IsRecursiveContainer<T>::value
-          ? kPrintContainer
-          : !std::is_pointer<T>::value
-                ? kPrintOther
-                : std::is_function<typename std::remove_pointer<T>::type>::value
-                      ? kPrintFunctionPointer
-                      : kPrintPointer > (),
-      value, os);
+  internal::PrintWithFallback(value, os);
 }
 
 // The following list of PrintTo() overloads tells
@@ -540,6 +469,16 @@
 // is implemented as an unsigned type.
 GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os);
 
+GTEST_API_ void PrintTo(char32_t c, ::std::ostream* os);
+inline void PrintTo(char16_t c, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<char32_t>(c), os);
+}
+#ifdef __cpp_char8_t
+inline void PrintTo(char8_t c, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<char32_t>(c), os);
+}
+#endif
+
 // Overloads for C strings.
 GTEST_API_ void PrintTo(const char* s, ::std::ostream* os);
 inline void PrintTo(char* s, ::std::ostream* os) {
@@ -560,6 +499,23 @@
 inline void PrintTo(unsigned char* s, ::std::ostream* os) {
   PrintTo(ImplicitCast_<const void*>(s), os);
 }
+#ifdef __cpp_char8_t
+// Overloads for u8 strings.
+GTEST_API_ void PrintTo(const char8_t* s, ::std::ostream* os);
+inline void PrintTo(char8_t* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const char8_t*>(s), os);
+}
+#endif
+// Overloads for u16 strings.
+GTEST_API_ void PrintTo(const char16_t* s, ::std::ostream* os);
+inline void PrintTo(char16_t* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const char16_t*>(s), os);
+}
+// Overloads for u32 strings.
+GTEST_API_ void PrintTo(const char32_t* s, ::std::ostream* os);
+inline void PrintTo(char32_t* s, ::std::ostream* os) {
+  PrintTo(ImplicitCast_<const char32_t*>(s), os);
+}
 
 // MSVC can be configured to define wchar_t as a typedef of unsigned
 // short.  It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native
@@ -594,6 +550,26 @@
   PrintStringTo(s, os);
 }
 
+// Overloads for ::std::u8string
+#ifdef __cpp_char8_t
+GTEST_API_ void PrintU8StringTo(const ::std::u8string& s, ::std::ostream* os);
+inline void PrintTo(const ::std::u8string& s, ::std::ostream* os) {
+  PrintU8StringTo(s, os);
+}
+#endif
+
+// Overloads for ::std::u16string
+GTEST_API_ void PrintU16StringTo(const ::std::u16string& s, ::std::ostream* os);
+inline void PrintTo(const ::std::u16string& s, ::std::ostream* os) {
+  PrintU16StringTo(s, os);
+}
+
+// Overloads for ::std::u32string
+GTEST_API_ void PrintU32StringTo(const ::std::u32string& s, ::std::ostream* os);
+inline void PrintTo(const ::std::u32string& s, ::std::ostream* os) {
+  PrintU32StringTo(s, os);
+}
+
 // Overloads for ::std::wstring.
 #if GTEST_HAS_STD_WSTRING
 GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os);
@@ -616,6 +592,43 @@
   UniversalPrinter<T&>::Print(ref.get(), os);
 }
 
+inline const void* VoidifyPointer(const void* p) { return p; }
+inline const void* VoidifyPointer(volatile const void* p) {
+  return const_cast<const void*>(p);
+}
+
+template <typename T, typename Ptr>
+void PrintSmartPointer(const Ptr& ptr, std::ostream* os, char) {
+  if (ptr == nullptr) {
+    *os << "(nullptr)";
+  } else {
+    // We can't print the value. Just print the pointer.
+    *os << "(" << (VoidifyPointer)(ptr.get()) << ")";
+  }
+}
+template <typename T, typename Ptr,
+          typename = typename std::enable_if<!std::is_void<T>::value &&
+                                             !std::is_array<T>::value>::type>
+void PrintSmartPointer(const Ptr& ptr, std::ostream* os, int) {
+  if (ptr == nullptr) {
+    *os << "(nullptr)";
+  } else {
+    *os << "(ptr = " << (VoidifyPointer)(ptr.get()) << ", value = ";
+    UniversalPrinter<T>::Print(*ptr, os);
+    *os << ")";
+  }
+}
+
+template <typename T, typename D>
+void PrintTo(const std::unique_ptr<T, D>& ptr, std::ostream* os) {
+  (PrintSmartPointer<T>)(ptr, os, 0);
+}
+
+template <typename T>
+void PrintTo(const std::shared_ptr<T>& ptr, std::ostream* os) {
+  (PrintSmartPointer<T>)(ptr, os, 0);
+}
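Editor's note: the trailing char/int parameter in PrintSmartPointer is plain overload ranking. The literal 0 at the call sites above prefers the int overload, and when enable_if removes that overload (void or array element types) the call degrades to the char one. A self-contained sketch of the dispatch, assuming the pointee is streamable (invented names):

    #include <iostream>
    #include <memory>
    #include <type_traits>

    // Preferred: exact match for the 0 argument, viable only for non-void,
    // non-array element types.
    template <typename T, typename Ptr,
              typename = typename std::enable_if<!std::is_void<T>::value &&
                                                 !std::is_array<T>::value>::type>
    void Describe(const Ptr& p, std::ostream* os, int) {
      *os << "(ptr = " << p.get() << ", value = " << *p << ")";
    }

    // Fallback: 0 -> char needs a conversion, so this loses unless the int
    // overload was SFINAE'd out.
    template <typename T, typename Ptr>
    void Describe(const Ptr& p, std::ostream* os, char) {
      *os << "(" << static_cast<const void*>(p.get()) << ")";
    }

    int main() {
      auto sp = std::make_shared<int>(7);
      Describe<int>(sp, &std::cout, 0);   // (ptr = 0x..., value = 7)
      std::shared_ptr<void> vp = sp;
      Describe<void>(vp, &std::cout, 0);  // fallback: (0x...)
      std::cout << '\n';
    }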
+
 // Helper function for printing a tuple.  T must be instantiated with
 // a tuple type.
 template <typename T>
@@ -681,14 +694,46 @@
   GTEST_DISABLE_MSC_WARNINGS_POP_()
 };
 
-#if GTEST_HAS_ABSL
+// Remove any const-qualifiers before passing a type to UniversalPrinter.
+template <typename T>
+class UniversalPrinter<const T> : public UniversalPrinter<T> {};
 
-// Printer for absl::optional
+#if GTEST_INTERNAL_HAS_ANY
+
+// Printer for std::any / absl::any
+
+template <>
+class UniversalPrinter<Any> {
+ public:
+  static void Print(const Any& value, ::std::ostream* os) {
+    if (value.has_value()) {
+      *os << "value of type " << GetTypeName(value);
+    } else {
+      *os << "no value";
+    }
+  }
+
+ private:
+  static std::string GetTypeName(const Any& value) {
+#if GTEST_HAS_RTTI
+    return internal::GetTypeName(value.type());
+#else
+    static_cast<void>(value);  // possibly unused
+    return "<unknown_type>";
+#endif  // GTEST_HAS_RTTI
+  }
+};
+
+#endif  // GTEST_INTERNAL_HAS_ANY
+
+#if GTEST_INTERNAL_HAS_OPTIONAL
+
+// Printer for std::optional / absl::optional
 
 template <typename T>
-class UniversalPrinter<::absl::optional<T>> {
+class UniversalPrinter<Optional<T>> {
  public:
-  static void Print(const ::absl::optional<T>& value, ::std::ostream* os) {
+  static void Print(const Optional<T>& value, ::std::ostream* os) {
     *os << '(';
     if (!value) {
       *os << "nullopt";
@@ -699,14 +744,22 @@
   }
 };
 
-// Printer for absl::variant
+#endif  // GTEST_INTERNAL_HAS_OPTIONAL
+
+#if GTEST_INTERNAL_HAS_VARIANT
+
+// Printer for std::variant / absl::variant
 
 template <typename... T>
-class UniversalPrinter<::absl::variant<T...>> {
+class UniversalPrinter<Variant<T...>> {
  public:
-  static void Print(const ::absl::variant<T...>& value, ::std::ostream* os) {
+  static void Print(const Variant<T...>& value, ::std::ostream* os) {
     *os << '(';
-    absl::visit(Visitor{os}, value);
+#if GTEST_HAS_ABSL
+    absl::visit(Visitor{os, value.index()}, value);
+#else
+    std::visit(Visitor{os, value.index()}, value);
+#endif  // GTEST_HAS_ABSL
     *os << ')';
   }
 
@@ -714,14 +767,16 @@
   struct Visitor {
     template <typename U>
     void operator()(const U& u) const {
-      *os << "'" << GetTypeName<U>() << "' with value ";
+      *os << "'" << GetTypeName<U>() << "(index = " << index
+          << ")' with value ";
       UniversalPrint(u, os);
     }
     ::std::ostream* os;
+    std::size_t index;
   };
 };
 
-#endif  // GTEST_HAS_ABSL
+#endif  // GTEST_INTERNAL_HAS_VARIANT
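Editor's note: these printers surface through testing::PrintToString(), defined near the bottom of this header. A hedged usage sketch under C++17, where GTEST_INTERNAL_HAS_OPTIONAL and GTEST_INTERNAL_HAS_VARIANT resolve to the std types (the variant's printed type name depends on RTTI):

    #include <iostream>
    #include <optional>
    #include <string>
    #include <variant>
    #include "gtest/gtest-printers.h"

    int main() {
      std::optional<int> o;                  // empty
      std::variant<int, std::string> v = 3;  // holds the int alternative
      std::cout << testing::PrintToString(o) << '\n'   // (nullopt)
                << testing::PrintToString(v) << '\n';  // typically
                // ('int(index = 0)' with value 3)
    }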
 
 // UniversalPrintArray(begin, len, os) prints an array of 'len'
 // elements, starting at address 'begin'.
@@ -750,6 +805,20 @@
 GTEST_API_ void UniversalPrintArray(
     const char* begin, size_t len, ::std::ostream* os);
 
+#ifdef __cpp_char8_t
+// This overload prints a (const) char8_t array compactly.
+GTEST_API_ void UniversalPrintArray(const char8_t* begin, size_t len,
+                                    ::std::ostream* os);
+#endif
+
+// This overload prints a (const) char16_t array compactly.
+GTEST_API_ void UniversalPrintArray(const char16_t* begin, size_t len,
+                                    ::std::ostream* os);
+
+// This overload prints a (const) char32_t array compactly.
+GTEST_API_ void UniversalPrintArray(const char32_t* begin, size_t len,
+                                    ::std::ostream* os);
+
 // This overload prints a (const) wchar_t array compactly.
 GTEST_API_ void UniversalPrintArray(
     const wchar_t* begin, size_t len, ::std::ostream* os);
@@ -822,12 +891,55 @@
   }
 };
 template <>
-class UniversalTersePrinter<char*> {
+class UniversalTersePrinter<char*> : public UniversalTersePrinter<const char*> {
+};
+
+#ifdef __cpp_char8_t
+template <>
+class UniversalTersePrinter<const char8_t*> {
  public:
-  static void Print(char* str, ::std::ostream* os) {
-    UniversalTersePrinter<const char*>::Print(str, os);
+  static void Print(const char8_t* str, ::std::ostream* os) {
+    if (str == nullptr) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(::std::u8string(str), os);
+    }
   }
 };
+template <>
+class UniversalTersePrinter<char8_t*>
+    : public UniversalTersePrinter<const char8_t*> {};
+#endif
+
+template <>
+class UniversalTersePrinter<const char16_t*> {
+ public:
+  static void Print(const char16_t* str, ::std::ostream* os) {
+    if (str == nullptr) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(::std::u16string(str), os);
+    }
+  }
+};
+template <>
+class UniversalTersePrinter<char16_t*>
+    : public UniversalTersePrinter<const char16_t*> {};
+
+template <>
+class UniversalTersePrinter<const char32_t*> {
+ public:
+  static void Print(const char32_t* str, ::std::ostream* os) {
+    if (str == nullptr) {
+      *os << "NULL";
+    } else {
+      UniversalPrint(::std::u32string(str), os);
+    }
+  }
+};
+template <>
+class UniversalTersePrinter<char32_t*>
+    : public UniversalTersePrinter<const char32_t*> {};
 
 #if GTEST_HAS_STD_WSTRING
 template <>
@@ -900,16 +1012,6 @@
 
 }  // namespace internal
 
-#if GTEST_INTERNAL_HAS_STRING_VIEW
-namespace internal2 {
-template <typename T>
-void TypeWithoutFormatter<T, kConvertibleToStringView>::PrintValue(
-    const T& value, ::std::ostream* os) {
-  internal::PrintTo(internal::StringView(value), os);
-}
-}  // namespace internal2
-#endif
-
 template <typename T>
 ::std::string PrintToString(const T& value) {
   ::std::stringstream ss;
@@ -924,4 +1026,4 @@
 // declarations from this file.
 #include "gtest/internal/custom/gtest-printers.h"
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_PRINTERS_H_
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h b/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h
index aa38870..eacef44 100644
--- a/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest-spi.h
@@ -33,8 +33,8 @@
 
 // GOOGLETEST_CM0004 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_
-#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
 
 #include "gtest/gtest.h"
 
@@ -235,4 +235,4 @@
     }\
   } while (::testing::internal::AlwaysFalse())
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_SPI_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_SPI_H_
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h b/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h
index 05a7985..203fdf9 100644
--- a/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest-test-part.h
@@ -29,8 +29,8 @@
 //
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
-#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
 
 #include <iosfwd>
 #include <vector>
@@ -181,4 +181,4 @@
 
 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_TEST_PART_H_
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h b/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h
index 3ffa50b..9fdc6be 100644
--- a/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest-typed-test.h
@@ -29,8 +29,8 @@
 
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
-#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
 
 // This header implements typed tests and type-parameterized tests.
 
@@ -175,8 +175,6 @@
 
 // Implements typed tests.
 
-#if GTEST_HAS_TYPED_TEST
-
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
 //
 // Expands to the name of the typedef for the type parameters of the
@@ -230,12 +228,8 @@
   TYPED_TEST_SUITE
 #endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
-#endif  // GTEST_HAS_TYPED_TEST
-
 // Implements type-parameterized tests.
 
-#if GTEST_HAS_TYPED_TEST_P
-
 // INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE.
 //
 // Expands to the namespace name that the type-parameterized tests for
@@ -332,6 +326,4 @@
   INSTANTIATE_TYPED_TEST_SUITE_P
 #endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
-#endif  // GTEST_HAS_TYPED_TEST_P
-
-#endif  // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest.h b/libvpx/third_party/googletest/src/include/gtest/gtest.h
index 39cff08..7a5d057 100644
--- a/libvpx/third_party/googletest/src/include/gtest/gtest.h
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest.h
@@ -49,8 +49,8 @@
 
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_H_
-#define GTEST_INCLUDE_GTEST_GTEST_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_H_
 
 #include <cstddef>
 #include <limits>
@@ -101,6 +101,10 @@
 // to let Google Test decide.
 GTEST_DECLARE_string_(color);
 
+// This flag controls whether the test runner should continue execution past
+// the first failure.
+GTEST_DECLARE_bool_(fail_fast);
+
 // This flag sets up the filter to select by name using a glob pattern
 // the tests to run. If the filter is not given all tests are executed.
 GTEST_DECLARE_string_(filter);
@@ -117,6 +121,9 @@
 // in addition to its normal textual output.
 GTEST_DECLARE_string_(output);
 
+// This flag controls whether Google Test prints only test failures.
+GTEST_DECLARE_bool_(brief);
+
 // This flag controls whether Google Test prints the elapsed time for each
 // test.
 GTEST_DECLARE_bool_(print_time);
@@ -411,10 +418,10 @@
   // The d'tor is virtual as we intend to inherit from Test.
   virtual ~Test();
 
-  // Sets up the stuff shared by all tests in this test case.
+  // Sets up the stuff shared by all tests in this test suite.
   //
   // Google Test will call Foo::SetUpTestSuite() before running the first
-  // test in test case Foo.  Hence a sub-class can define its own
+  // test in test suite Foo.  Hence a sub-class can define its own
   // SetUpTestSuite() method to shadow the one defined in the super
   // class.
   static void SetUpTestSuite() {}
@@ -422,12 +429,13 @@
   // Tears down the stuff shared by all tests in this test suite.
   //
   // Google Test will call Foo::TearDownTestSuite() after running the last
-  // test in test case Foo.  Hence a sub-class can define its own
+  // test in test suite Foo.  Hence a sub-class can define its own
   // TearDownTestSuite() method to shadow the one defined in the super
   // class.
   static void TearDownTestSuite() {}
 
-  // Legacy API is deprecated but still available
+  // Legacy API is deprecated but still available. Use SetUpTestSuite and
+  // TearDownTestSuite instead.
 #ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
   static void TearDownTestCase() {}
   static void SetUpTestCase() {}
@@ -665,7 +673,7 @@
 
   // Protects mutable state of the property vector and of owned
   // properties, whose values may be updated.
-  internal::Mutex test_properites_mutex_;
+  internal::Mutex test_properties_mutex_;
 
   // The vector of TestPartResults
   std::vector<TestPartResult> test_part_results_;
@@ -795,6 +803,9 @@
   // deletes it.
   void Run();
 
+  // Skips and records the test result for this object.
+  void Skip();
+
   static void ClearTestResult(TestInfo* test_info) {
     test_info->result_.Clear();
   }
@@ -943,6 +954,9 @@
   // Runs every test in this TestSuite.
   void Run();
 
+  // Skips the execution of tests under this TestSuite.
+  void Skip();
+
   // Runs SetUpTestSuite() for this TestSuite.  This wrapper is needed
   // for catching exceptions thrown from SetUpTestSuite().
   void RunSetUpTestSuite() {
@@ -1535,14 +1549,6 @@
   return CmpHelperEQFailure(lhs_expression, rhs_expression, lhs, rhs);
 }
 
-// With this overloaded version, we allow anonymous enums to be used
-// in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums
-// can be implicitly cast to BiggestInt.
-GTEST_API_ AssertionResult CmpHelperEQ(const char* lhs_expression,
-                                       const char* rhs_expression,
-                                       BiggestInt lhs,
-                                       BiggestInt rhs);
-
 class EqHelper {
  public:
   // This templatized version is for the general case.
@@ -1599,11 +1605,6 @@
 // ASSERT_?? and EXPECT_??.  It is here just to avoid copy-and-paste
 // of similar code.
 //
-// For each templatized helper function, we also define an overloaded
-// version for BiggestInt in order to reduce code bloat and allow
-// anonymous enums to be used with {ASSERT|EXPECT}_?? when compiled
-// with gcc 4.
-//
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 
 #define GTEST_IMPL_CMP_HELPER_(op_name, op)\
@@ -1615,22 +1616,20 @@
   } else {\
     return CmpHelperOpFailure(expr1, expr2, val1, val2, #op);\
   }\
-}\
-GTEST_API_ AssertionResult CmpHelper##op_name(\
-    const char* expr1, const char* expr2, BiggestInt val1, BiggestInt val2)
+}
 
 // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM.
 
 // Implements the helper function for {ASSERT|EXPECT}_NE
-GTEST_IMPL_CMP_HELPER_(NE, !=);
+GTEST_IMPL_CMP_HELPER_(NE, !=)
 // Implements the helper function for {ASSERT|EXPECT}_LE
-GTEST_IMPL_CMP_HELPER_(LE, <=);
+GTEST_IMPL_CMP_HELPER_(LE, <=)
 // Implements the helper function for {ASSERT|EXPECT}_LT
-GTEST_IMPL_CMP_HELPER_(LT, <);
+GTEST_IMPL_CMP_HELPER_(LT, <)
 // Implements the helper function for {ASSERT|EXPECT}_GE
-GTEST_IMPL_CMP_HELPER_(GE, >=);
+GTEST_IMPL_CMP_HELPER_(GE, >=)
 // Implements the helper function for {ASSERT|EXPECT}_GT
-GTEST_IMPL_CMP_HELPER_(GT, >);
+GTEST_IMPL_CMP_HELPER_(GT, >)
 
 #undef GTEST_IMPL_CMP_HELPER_
 
@@ -1807,12 +1806,6 @@
   GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper);
 };
 
-enum GTestColor { COLOR_DEFAULT, COLOR_RED, COLOR_GREEN, COLOR_YELLOW };
-
-GTEST_API_ GTEST_ATTRIBUTE_PRINTF_(2, 3) void ColoredPrintf(GTestColor color,
-                                                            const char* fmt,
-                                                            ...);
-
 }  // namespace internal
 
 // The pure interface class that all value-parameterized tests inherit from.
@@ -1969,19 +1962,38 @@
 // Boolean assertions. Condition can be either a Boolean expression or an
 // AssertionResult. For more information on how to use AssertionResult with
 // these macros see comments on that class.
-#define EXPECT_TRUE(condition) \
+#define GTEST_EXPECT_TRUE(condition) \
   GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
                       GTEST_NONFATAL_FAILURE_)
-#define EXPECT_FALSE(condition) \
+#define GTEST_EXPECT_FALSE(condition) \
   GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
                       GTEST_NONFATAL_FAILURE_)
-#define ASSERT_TRUE(condition) \
+#define GTEST_ASSERT_TRUE(condition) \
   GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \
                       GTEST_FATAL_FAILURE_)
-#define ASSERT_FALSE(condition) \
+#define GTEST_ASSERT_FALSE(condition) \
   GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \
                       GTEST_FATAL_FAILURE_)
 
+// Define these macros to 1 to omit the definition of the corresponding
+// EXPECT or ASSERT, which clashes with some users' own code.
+
+#if !GTEST_DONT_DEFINE_EXPECT_TRUE
+#define EXPECT_TRUE(condition) GTEST_EXPECT_TRUE(condition)
+#endif
+
+#if !GTEST_DONT_DEFINE_EXPECT_FALSE
+#define EXPECT_FALSE(condition) GTEST_EXPECT_FALSE(condition)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_TRUE
+#define ASSERT_TRUE(condition) GTEST_ASSERT_TRUE(condition)
+#endif
+
+#if !GTEST_DONT_DEFINE_ASSERT_FALSE
+#define ASSERT_FALSE(condition) GTEST_ASSERT_FALSE(condition)
+#endif
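Editor's note: the new GTEST_DONT_DEFINE_* guards let a codebase that already defines, say, ASSERT_TRUE keep its own macro while still using gtest's prefixed spelling. A usage sketch (link against gtest_main):

    // Define before including gtest to suppress the short name.
    #define GTEST_DONT_DEFINE_ASSERT_TRUE 1
    #include "gtest/gtest.h"

    TEST(OptOutDemo, PrefixedSpelling) {
      // The short ASSERT_TRUE is not defined in this translation unit;
      // the prefixed form is always available.
      GTEST_ASSERT_TRUE(1 + 1 == 2);
    }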
+
 // Macros for testing equalities and inequalities.
 //
 //    * {ASSERT|EXPECT}_EQ(v1, v2): Tests that v1 == v2
@@ -2480,4 +2492,4 @@
 
 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_H_
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h b/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h
index d514255..5029a9b 100644
--- a/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest_pred_impl.h
@@ -33,8 +33,8 @@
 // Implements a family of generic predicate assertion macros.
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
 
 #include "gtest/gtest.h"
 
@@ -356,4 +356,4 @@
 
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_
diff --git a/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h b/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h
index e651671..38b9d85 100644
--- a/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h
+++ b/libvpx/third_party/googletest/src/include/gtest/gtest_prod.h
@@ -31,8 +31,8 @@
 // Google C++ Testing and Mocking Framework definitions useful in production code.
 // GOOGLETEST_CM0003 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_
-#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
+#define GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
 
 // When you need to test the private or protected members of a class,
 // use the FRIEND_TEST macro to declare your tests as friends of the
@@ -58,4 +58,4 @@
 #define FRIEND_TEST(test_case_name, test_name)\
 friend class test_case_name##_##test_name##_Test
 
-#endif  // GTEST_INCLUDE_GTEST_GTEST_PROD_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_GTEST_PROD_H_
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h b/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h
index cd85d95..db02881 100644
--- a/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h
+++ b/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-port.h
@@ -31,7 +31,7 @@
 //
 // ** Custom implementation starts here **
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PORT_H_
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h b/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h
index eb4467a..b9495d8 100644
--- a/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h
+++ b/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest-printers.h
@@ -36,7 +36,7 @@
 //
 // ** Custom implementation starts here **
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_PRINTERS_H_
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h b/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h
index 4c8e07b..afaaf17 100644
--- a/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h
+++ b/libvpx/third_party/googletest/src/include/gtest/internal/custom/gtest.h
@@ -31,7 +31,7 @@
 //
 // ** Custom implementation starts here **
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_CUSTOM_GTEST_H_
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h
index 68bd353..490296d 100644
--- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h
+++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-death-test-internal.h
@@ -33,8 +33,8 @@
 // death tests.  They are subject to change without notice.
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
 
 #include "gtest/gtest-matchers.h"
 #include "gtest/internal/gtest-internal.h"
@@ -301,4 +301,4 @@
 }  // namespace internal
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h
index c11b101..0c033ab 100644
--- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h
+++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-filepath.h
@@ -37,8 +37,8 @@
 
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
 
 #include "gtest/internal/gtest-string.h"
 
@@ -195,7 +195,7 @@
 
   void Normalize();
 
-  // Returns a pointer to the last occurence of a valid path separator in
+  // Returns a pointer to the last occurrence of a valid path separator in
   // the FilePath. On Windows, for example, both '/' and '\' are valid path
   // separators. Returns NULL if no path separator was found.
   const char* FindLastPathSeparator() const;
@@ -208,4 +208,4 @@
 
 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h
index 6bad878..f8cbdbd 100644
--- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h
+++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-internal.h
@@ -34,8 +34,8 @@
 
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
 
 #include "gtest/internal/gtest-port.h"
 
@@ -90,7 +90,9 @@
 #define GTEST_STRINGIFY_HELPER_(name, ...) #name
 #define GTEST_STRINGIFY_(...) GTEST_STRINGIFY_HELPER_(__VA_ARGS__, )
 
-namespace proto2 { class Message; }
+namespace proto2 {
+class MessageLite;
+}
 
 namespace testing {
 
@@ -285,7 +287,7 @@
   //
   // See the following article for more details on ULP:
   // http://randomascii.wordpress.com/2012/02/25/comparing-floating-point-numbers-2012-edition/
-  static const size_t kMaxUlps = 4;
+  static const uint32_t kMaxUlps = 4;
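Editor's note: kMaxUlps = 4 encodes the policy described in the linked article: two floats compare almost-equal when their IEEE-754 bit patterns, mapped onto a monotonically ordered unsigned range, differ by at most four units in the last place. A minimal standalone sketch of that biased-integer trick (not gtest's internal FloatingPoint class):

    #include <cstdint>
    #include <cstring>
    #include <iostream>

    // Map float bits onto an ordered unsigned range so that adjacent
    // representable floats differ by exactly 1.
    static uint32_t BiasedBits(float f) {
      uint32_t bits;
      std::memcpy(&bits, &f, sizeof bits);  // well-defined type pun
      const uint32_t sign = UINT32_C(1) << 31;
      return (bits & sign) ? ~bits + 1 : bits | sign;
    }

    static bool AlmostEqualUlps(float a, float b, uint32_t max_ulps = 4) {
      uint32_t x = BiasedBits(a), y = BiasedBits(b);
      return (x >= y ? x - y : y - x) <= max_ulps;
    }

    int main() {
      float sum = 0.0f;
      for (int i = 0; i < 10; ++i) sum += 0.1f;  // accumulates rounding error
      std::cout << (sum == 1.0f) << ' '          // 0: one ULP above 1.0f
                << AlmostEqualUlps(sum, 1.0f) << '\n';  // 1: within 4 ULPs
    }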
 
   // Constructs a FloatingPoint from a raw floating-point number.
   //
@@ -518,6 +520,7 @@
 
   static SetUpTearDownSuiteFuncType GetSetUpCaseOrSuite(const char* filename,
                                                         int line_num) {
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
     SetUpTearDownSuiteFuncType test_case_fp =
         GetNotDefaultOrNull(&T::SetUpTestCase, &Test::SetUpTestCase);
     SetUpTearDownSuiteFuncType test_suite_fp =
@@ -529,10 +532,16 @@
         << filename << ":" << line_num;
 
     return test_case_fp != nullptr ? test_case_fp : test_suite_fp;
+#else
+    (void)(filename);
+    (void)(line_num);
+    return &T::SetUpTestSuite;
+#endif
   }
 
   static SetUpTearDownSuiteFuncType GetTearDownCaseOrSuite(const char* filename,
                                                            int line_num) {
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
     SetUpTearDownSuiteFuncType test_case_fp =
         GetNotDefaultOrNull(&T::TearDownTestCase, &Test::TearDownTestCase);
     SetUpTearDownSuiteFuncType test_suite_fp =
@@ -544,6 +553,11 @@
         << filename << ":" << line_num;
 
     return test_case_fp != nullptr ? test_case_fp : test_suite_fp;
+#else
+    (void)(filename);
+    (void)(line_num);
+    return &T::TearDownTestSuite;
+#endif
   }
 };
 
@@ -552,11 +566,11 @@
 //
 // Arguments:
 //
-//   test_suite_name:   name of the test suite
+//   test_suite_name:  name of the test suite
 //   name:             name of the test
-//   type_param        the name of the test's type parameter, or NULL if
+//   type_param:       the name of the test's type parameter, or NULL if
 //                     this is not a typed or a type-parameterized test.
-//   value_param       text representation of the test's value parameter,
+//   value_param:      text representation of the test's value parameter,
 //                     or NULL if this is not a type-parameterized test.
 //   code_location:    code location where the test is defined
 //   fixture_class_id: ID of the test fixture class
@@ -576,8 +590,6 @@
 // and returns false.  None of pstr, *pstr, and prefix can be NULL.
 GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr);
 
-#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
 GTEST_DISABLE_MSC_WARNINGS_PUSH_(4251 \
 /* class A needs to have dll-interface to be used by clients of class B */)
 
@@ -809,8 +821,6 @@
   }
 };
 
-#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
 // Returns the current OS stack trace as an std::string.
 //
 // The maximum number of stack frames to be included is specified by
@@ -878,11 +888,34 @@
 #define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \
   typename std::remove_const<typename std::remove_reference<T>::type>::type
 
-// IsAProtocolMessage<T>::value is a compile-time bool constant that's
-// true if and only if T is type proto2::Message or a subclass of it.
+// HasDebugStringAndShortDebugString<T>::value is a compile-time bool constant
+// that's true if and only if T has methods DebugString() and ShortDebugString()
+// that return std::string.
 template <typename T>
-struct IsAProtocolMessage
-    : public std::is_convertible<const T*, const ::proto2::Message*> {};
+class HasDebugStringAndShortDebugString {
+ private:
+  template <typename C>
+  static auto CheckDebugString(C*) -> typename std::is_same<
+      std::string, decltype(std::declval<const C>().DebugString())>::type;
+  template <typename>
+  static std::false_type CheckDebugString(...);
+
+  template <typename C>
+  static auto CheckShortDebugString(C*) -> typename std::is_same<
+      std::string, decltype(std::declval<const C>().ShortDebugString())>::type;
+  template <typename>
+  static std::false_type CheckShortDebugString(...);
+
+  using HasDebugStringType = decltype(CheckDebugString<T>(nullptr));
+  using HasShortDebugStringType = decltype(CheckShortDebugString<T>(nullptr));
+
+ public:
+  static constexpr bool value =
+      HasDebugStringType::value && HasShortDebugStringType::value;
+};
+
+template <typename T>
+constexpr bool HasDebugStringAndShortDebugString<T>::value;
 
 // When the compiler sees expression IsContainerTest<C>(0), if C is an
 // STL-style container class, the first overload of IsContainerTest
@@ -1118,8 +1151,6 @@
   const Element* array_;
   size_t size_;
   void (NativeArray::*clone_)(const Element*, size_t);
-
-  GTEST_DISALLOW_ASSIGN_(NativeArray);
 };
 
 // Backport of std::index_sequence.
@@ -1143,12 +1174,18 @@
 // Backport of std::make_index_sequence.
 // It uses O(ln(N)) instantiation depth.
 template <size_t N>
-struct MakeIndexSequence
-    : DoubleSequence<N % 2 == 1, typename MakeIndexSequence<N / 2>::type,
+struct MakeIndexSequenceImpl
+    : DoubleSequence<N % 2 == 1, typename MakeIndexSequenceImpl<N / 2>::type,
                      N / 2>::type {};
 
 template <>
-struct MakeIndexSequence<0> : IndexSequence<> {};
+struct MakeIndexSequenceImpl<0> : IndexSequence<> {};
+
+template <size_t N>
+using MakeIndexSequence = typename MakeIndexSequenceImpl<N>::type;
+
+template <typename... T>
+using IndexSequenceFor = typename MakeIndexSequence<sizeof...(T)>::type;
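Editor's note: MakeIndexSequenceImpl keeps the O(log N) instantiation depth noted above: the sequence for N is two spliced copies of the sequence for N/2, plus one extra element when N is odd. A standalone sketch in the same shape (invented names):

    #include <cstddef>
    #include <initializer_list>
    #include <iostream>

    template <std::size_t...>
    struct IndexSeq { using type = IndexSeq; };

    // Splice <I...> with a copy shifted by kHalf; append 2*kHalf when odd.
    template <bool kOdd, typename T, std::size_t kHalf>
    struct Splice;
    template <std::size_t... I, std::size_t kHalf>
    struct Splice<false, IndexSeq<I...>, kHalf>
        : IndexSeq<I..., (kHalf + I)...> {};
    template <std::size_t... I, std::size_t kHalf>
    struct Splice<true, IndexSeq<I...>, kHalf>
        : IndexSeq<I..., (kHalf + I)..., 2 * kHalf> {};

    template <std::size_t N>
    struct MakeSeq
        : Splice<N % 2 == 1, typename MakeSeq<N / 2>::type, N / 2> {};
    template <>
    struct MakeSeq<0> : IndexSeq<> {};

    template <std::size_t... I>
    void PrintAll(IndexSeq<I...>) {
      for (std::size_t i : {I...}) std::cout << i << ' ';
    }

    int main() {
      PrintAll(typename MakeSeq<5>::type{});  // 0 1 2 3 4
      std::cout << '\n';
    }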
 
 template <size_t>
 struct Ignore {
@@ -1174,6 +1211,8 @@
           static_cast<T (*)()>(nullptr)...));
 };
 
+struct FlatTupleConstructTag {};
+
 template <typename... T>
 class FlatTuple;
 
@@ -1184,7 +1223,9 @@
 struct FlatTupleElemBase<FlatTuple<T...>, I> {
   using value_type = typename ElemFromList<I, T...>::type;
   FlatTupleElemBase() = default;
-  explicit FlatTupleElemBase(value_type t) : value(std::move(t)) {}
+  template <typename Arg>
+  explicit FlatTupleElemBase(FlatTupleConstructTag, Arg&& t)
+      : value(std::forward<Arg>(t)) {}
   value_type value;
 };
 
@@ -1196,8 +1237,30 @@
     : FlatTupleElemBase<FlatTuple<T...>, Idx>... {
   using Indices = IndexSequence<Idx...>;
   FlatTupleBase() = default;
-  explicit FlatTupleBase(T... t)
-      : FlatTupleElemBase<FlatTuple<T...>, Idx>(std::move(t))... {}
+  template <typename... Args>
+  explicit FlatTupleBase(FlatTupleConstructTag, Args&&... args)
+      : FlatTupleElemBase<FlatTuple<T...>, Idx>(FlatTupleConstructTag{},
+                                                std::forward<Args>(args))... {}
+
+  template <size_t I>
+  const typename ElemFromList<I, T...>::type& Get() const {
+    return FlatTupleElemBase<FlatTuple<T...>, I>::value;
+  }
+
+  template <size_t I>
+  typename ElemFromList<I, T...>::type& Get() {
+    return FlatTupleElemBase<FlatTuple<T...>, I>::value;
+  }
+
+  template <typename F>
+  auto Apply(F&& f) -> decltype(std::forward<F>(f)(this->Get<Idx>()...)) {
+    return std::forward<F>(f)(Get<Idx>()...);
+  }
+
+  template <typename F>
+  auto Apply(F&& f) const -> decltype(std::forward<F>(f)(this->Get<Idx>()...)) {
+    return std::forward<F>(f)(Get<Idx>()...);
+  }
 };
 
 // Analog to std::tuple but with different tradeoffs.
@@ -1218,17 +1281,12 @@
 
  public:
   FlatTuple() = default;
-  explicit FlatTuple(T... t) : FlatTuple::FlatTupleBase(std::move(t)...) {}
+  template <typename... Args>
+  explicit FlatTuple(FlatTupleConstructTag tag, Args&&... args)
+      : FlatTuple::FlatTupleBase(tag, std::forward<Args>(args)...) {}
 
-  template <size_t I>
-  const typename ElemFromList<I, T...>::type& Get() const {
-    return static_cast<const FlatTupleElemBase<FlatTuple, I>*>(this)->value;
-  }
-
-  template <size_t I>
-  typename ElemFromList<I, T...>::type& Get() {
-    return static_cast<FlatTupleElemBase<FlatTuple, I>*>(this)->value;
-  }
+  using FlatTuple::FlatTupleBase::Apply;
+  using FlatTuple::FlatTupleBase::Get;
 };
 
 // Utility functions to be called with static_assert to induce deprecation
@@ -1261,6 +1319,22 @@
 }  // namespace internal
 }  // namespace testing
 
+namespace std {
+// Some standard library implementations use `struct tuple_size` and some use
+// `class tuple_size`. Clang warns about the mismatch.
+// https://reviews.llvm.org/D55466
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wmismatched-tags"
+#endif
+template <typename... Ts>
+struct tuple_size<testing::internal::FlatTuple<Ts...>>
+    : std::integral_constant<size_t, sizeof...(Ts)> {};
+#ifdef __clang__
+#pragma clang diagnostic pop
+#endif
+}  // namespace std
+
 #define GTEST_MESSAGE_AT_(file, line, message, result_type) \
   ::testing::internal::AssertHelper(result_type, file, line, message) \
     = ::testing::Message()
@@ -1283,44 +1357,98 @@
 // Suppress MSVC warning 4072 (unreachable code) for the code following
 // statement if it returns or throws (or doesn't return or throw in some
 // situations).
+// NOTE: The "else" is important to keep this expansion to prevent a top-level
+// "else" from attaching to our "if".
 #define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \
-  if (::testing::internal::AlwaysTrue()) { statement; }
-
-#define GTEST_TEST_THROW_(statement, expected_exception, fail) \
-  GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
-  if (::testing::internal::ConstCharPtr gtest_msg = "") { \
-    bool gtest_caught_expected = false; \
-    try { \
-      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \
-    } \
-    catch (expected_exception const&) { \
-      gtest_caught_expected = true; \
-    } \
-    catch (...) { \
-      gtest_msg.value = \
-          "Expected: " #statement " throws an exception of type " \
-          #expected_exception ".\n  Actual: it throws a different type."; \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
-    } \
-    if (!gtest_caught_expected) { \
-      gtest_msg.value = \
-          "Expected: " #statement " throws an exception of type " \
-          #expected_exception ".\n  Actual: it throws nothing."; \
-      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \
-    } \
-  } else \
-    GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__): \
-      fail(gtest_msg.value)
+  if (::testing::internal::AlwaysTrue()) {                        \
+    statement;                                                    \
+  } else                     /* NOLINT */                         \
+    static_assert(true, "")  // User must have a semicolon after expansion.
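
The hazard the NOTE describes is easiest to see by expanding a tail-less variant by hand. In the hypothetical sketch below, the user's "else" silently re-binds to the macro's hidden "if"; the "else static_assert(true, "")" tail in the real macro rules this out while still letting the user write a trailing semicolon:

#include <cassert>

// Hypothetical macro with no protective tail.
#define UNPROTECTED_WRAP(statement) \
  if (true) {                       \
    statement;                      \
  }

void Demo(bool cond, int* out) {
  if (cond)
    UNPROTECTED_WRAP(*out = 1)  // a ';' here would even break the parse below
  else
    *out = 2;  // binds to the macro's hidden "if (true)", not to "if (cond)"
}

int main() {
  int v = 0;
  Demo(false, &v);
  assert(v == 0);  // "*out = 2" never ran: the inner "if" captured the "else"
  return 0;
}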
 
 #if GTEST_HAS_EXCEPTIONS
 
-#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_() \
-  catch (std::exception const& e) { \
-    gtest_msg.value = ( \
-      "it throws std::exception-derived exception with description: \"" \
-    ); \
-    gtest_msg.value += e.what(); \
-    gtest_msg.value += "\"."; \
+namespace testing {
+namespace internal {
+
+class NeverThrown {
+ public:
+  const char* what() const noexcept {
+    return "this exception should never be thrown";
+  }
+};
+
+}  // namespace internal
+}  // namespace testing
+
+#if GTEST_HAS_RTTI
+
+#define GTEST_EXCEPTION_TYPE_(e) ::testing::internal::GetTypeName(typeid(e))
+
+#else  // GTEST_HAS_RTTI
+
+#define GTEST_EXCEPTION_TYPE_(e) \
+  std::string { "an std::exception-derived error" }
+
+#endif  // GTEST_HAS_RTTI
+
+#define GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception)   \
+  catch (typename std::conditional<                                            \
+         std::is_same<typename std::remove_cv<typename std::remove_reference<  \
+                          expected_exception>::type>::type,                    \
+                      std::exception>::value,                                  \
+         const ::testing::internal::NeverThrown&, const std::exception&>::type \
+             e) {                                                              \
+    gtest_msg.value = "Expected: " #statement                                  \
+                      " throws an exception of type " #expected_exception      \
+                      ".\n  Actual: it throws ";                               \
+    gtest_msg.value += GTEST_EXCEPTION_TYPE_(e);                               \
+    gtest_msg.value += " with description \"";                                 \
+    gtest_msg.value += e.what();                                               \
+    gtest_msg.value += "\".";                                                  \
+    goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__);                \
+  }
+
+#else  // GTEST_HAS_EXCEPTIONS
+
+#define GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception)
+
+#endif  // GTEST_HAS_EXCEPTIONS
+
+#define GTEST_TEST_THROW_(statement, expected_exception, fail)              \
+  GTEST_AMBIGUOUS_ELSE_BLOCKER_                                             \
+  if (::testing::internal::TrueWithString gtest_msg{}) {                    \
+    bool gtest_caught_expected = false;                                     \
+    try {                                                                   \
+      GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement);            \
+    } catch (expected_exception const&) {                                   \
+      gtest_caught_expected = true;                                         \
+    }                                                                       \
+    GTEST_TEST_THROW_CATCH_STD_EXCEPTION_(statement, expected_exception)    \
+    catch (...) {                                                           \
+      gtest_msg.value = "Expected: " #statement                             \
+                        " throws an exception of type " #expected_exception \
+                        ".\n  Actual: it throws a different type.";         \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__);           \
+    }                                                                       \
+    if (!gtest_caught_expected) {                                           \
+      gtest_msg.value = "Expected: " #statement                             \
+                        " throws an exception of type " #expected_exception \
+                        ".\n  Actual: it throws nothing.";                  \
+      goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__);           \
+    }                                                                       \
+  } else /*NOLINT*/                                                         \
+    GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__)                   \
+        : fail(gtest_msg.value.c_str())
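
The NeverThrown/std::conditional machinery above exists for the corner case EXPECT_THROW(statement, std::exception): without it, the expansion would contain two catch clauses for std::exception, leaving the diagnostic one unreachable and drawing compiler warnings. A self-contained sketch of just the type selection (NeverThrownTag is a stand-in name):

#include <exception>
#include <stdexcept>
#include <type_traits>

struct NeverThrownTag {};  // stands in for ::testing::internal::NeverThrown

template <typename Expected>
using DiagnosticCatchType = typename std::conditional<
    std::is_same<typename std::remove_cv<typename std::remove_reference<
                     Expected>::type>::type,
                 std::exception>::value,
    const NeverThrownTag&, const std::exception&>::type;

// When the expected type IS std::exception, the diagnostic handler degrades
// to a type nothing ever throws; otherwise it catches std::exception.
static_assert(std::is_same<DiagnosticCatchType<std::exception>,
                           const NeverThrownTag&>::value, "");
static_assert(std::is_same<DiagnosticCatchType<std::runtime_error>,
                           const std::exception&>::value, "");

int main() { return 0; }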
+
+#if GTEST_HAS_EXCEPTIONS
+
+#define GTEST_TEST_NO_THROW_CATCH_STD_EXCEPTION_()                \
+  catch (std::exception const& e) {                               \
+    gtest_msg.value = "it throws ";                               \
+    gtest_msg.value += GTEST_EXCEPTION_TYPE_(e);                  \
+    gtest_msg.value += " with description \"";                    \
+    gtest_msg.value += e.what();                                  \
+    gtest_msg.value += "\".";                                     \
     goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \
   }
 
@@ -1367,7 +1495,7 @@
 
 // Implements Boolean test assertions such as EXPECT_TRUE. expression can be
 // either a boolean expression or an AssertionResult. text is a textual
-// represenation of expression as it was passed into the EXPECT_TRUE.
+// representation of expression as it was passed into the EXPECT_TRUE.
 #define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \
   GTEST_AMBIGUOUS_ELSE_BLOCKER_ \
   if (const ::testing::AssertionResult gtest_ar_ = \
@@ -1404,7 +1532,7 @@
   class GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)                    \
       : public parent_class {                                                 \
    public:                                                                    \
-    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() {}                   \
+    GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() = default;           \
     ~GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)() override = default; \
     GTEST_DISALLOW_COPY_AND_ASSIGN_(GTEST_TEST_CLASS_NAME_(test_suite_name,   \
                                                            test_name));       \
@@ -1429,4 +1557,4 @@
               test_suite_name, test_name)>);                                  \
   void GTEST_TEST_CLASS_NAME_(test_suite_name, test_name)::TestBody()
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h
index 7f7a13b..c2ef6e3 100644
--- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h
+++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-param-util.h
@@ -32,8 +32,8 @@
 
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
 
 #include <ctype.h>
 
@@ -459,7 +459,7 @@
 
   // Base part of test suite name for display purposes.
   virtual const std::string& GetTestSuiteName() const = 0;
-  // Test case id to verify identity.
+  // Test suite id to verify identity.
   virtual TypeId GetTestSuiteTypeId() const = 0;
   // UnitTest class invokes this method to register tests in this
   // test suite right before running them in RUN_ALL_TESTS macro.
@@ -478,7 +478,7 @@
 //
 // Reports the name of a test_suite as safe to ignore
 // as the side effect of construction of this type.
-struct MarkAsIgnored {
+struct GTEST_API_ MarkAsIgnored {
   explicit MarkAsIgnored(const char* test_suite);
 };
 
@@ -507,11 +507,11 @@
                                       CodeLocation code_location)
       : test_suite_name_(name), code_location_(code_location) {}
 
-  // Test case base name for display purposes.
+  // Test suite base name for display purposes.
   const std::string& GetTestSuiteName() const override {
     return test_suite_name_;
   }
-  // Test case id to verify identity.
+  // Test suite id to verify identity.
   TypeId GetTestSuiteTypeId() const override { return GetTypeId<TestSuite>(); }
   // TEST_P macro uses AddTestPattern() to record information
   // about a single test in a LocalTestInfo structure.
@@ -520,9 +520,10 @@
   // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is
   // test suite base name and DoBar is test base name.
   void AddTestPattern(const char* test_suite_name, const char* test_base_name,
-                      TestMetaFactoryBase<ParamType>* meta_factory) {
-    tests_.push_back(std::shared_ptr<TestInfo>(
-        new TestInfo(test_suite_name, test_base_name, meta_factory)));
+                      TestMetaFactoryBase<ParamType>* meta_factory,
+                      CodeLocation code_location) {
+    tests_.push_back(std::shared_ptr<TestInfo>(new TestInfo(
+        test_suite_name, test_base_name, meta_factory, code_location)));
   }
   // INSTANTIATE_TEST_SUITE_P macro uses AddGenerator() to record information
   // about a generator.
@@ -589,7 +590,7 @@
           MakeAndRegisterTestInfo(
               test_suite_name.c_str(), test_name_stream.GetString().c_str(),
               nullptr,  // No type parameter.
-              PrintToString(*param_it).c_str(), code_location_,
+              PrintToString(*param_it).c_str(), test_info->code_location,
               GetTestSuiteTypeId(),
               SuiteApiResolver<TestSuite>::GetSetUpCaseOrSuite(file, line),
               SuiteApiResolver<TestSuite>::GetTearDownCaseOrSuite(file, line),
@@ -610,14 +611,17 @@
   // with TEST_P macro.
   struct TestInfo {
     TestInfo(const char* a_test_suite_base_name, const char* a_test_base_name,
-             TestMetaFactoryBase<ParamType>* a_test_meta_factory)
+             TestMetaFactoryBase<ParamType>* a_test_meta_factory,
+             CodeLocation a_code_location)
         : test_suite_base_name(a_test_suite_base_name),
           test_base_name(a_test_base_name),
-          test_meta_factory(a_test_meta_factory) {}
+          test_meta_factory(a_test_meta_factory),
+          code_location(a_code_location) {}
 
     const std::string test_suite_base_name;
     const std::string test_base_name;
     const std::unique_ptr<TestMetaFactoryBase<ParamType> > test_meta_factory;
+    const CodeLocation code_location;
   };
   using TestInfoContainer = ::std::vector<std::shared_ptr<TestInfo> >;
   // Records data received from INSTANTIATE_TEST_SUITE_P macros:
@@ -650,7 +654,7 @@
 
     // Check for invalid characters
     for (std::string::size_type index = 0; index < name.size(); ++index) {
-      if (!isalnum(name[index]) && name[index] != '_')
+      if (!IsAlNum(name[index]) && name[index] != '_')
         return false;
     }
 
@@ -779,10 +783,15 @@
 namespace internal {
 // Used in the Values() function to provide polymorphic capabilities.
 
+#ifdef _MSC_VER
+#pragma warning(push)
+#pragma warning(disable : 4100)
+#endif
+
 template <typename... Ts>
 class ValueArray {
  public:
-  ValueArray(Ts... v) : v_{std::move(v)...} {}
+  explicit ValueArray(Ts... v) : v_(FlatTupleConstructTag{}, std::move(v)...) {}
 
   template <typename T>
   operator ParamGenerator<T>() const {  // NOLINT
@@ -798,6 +807,10 @@
   FlatTuple<Ts...> v_;
 };
 
+#ifdef _MSC_VER
+#pragma warning(pop)
+#endif
+
 template <typename... T>
 class CartesianProductGenerator
     : public ParamGeneratorInterface<::std::tuple<T...>> {
@@ -931,4 +944,4 @@
 }  // namespace internal
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h
index d3239b2..dd84591 100644
--- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h
+++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port-arch.h
@@ -32,8 +32,8 @@
 // This header file defines the GTEST_OS_* macro.
 // It is separate from gtest-port.h so that custom/gtest-port.h can include it.
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
 
 // Determines the platform on which Google Test is compiled.
 #ifdef __CYGWIN__
@@ -68,6 +68,7 @@
 # define GTEST_OS_OS2 1
 #elif defined __APPLE__
 # define GTEST_OS_MAC 1
+# include <TargetConditionals.h>
 # if TARGET_OS_IPHONE
 #  define GTEST_OS_IOS 1
 # endif
@@ -106,6 +107,8 @@
 #define GTEST_OS_ESP8266 1
 #elif defined ESP32
 #define GTEST_OS_ESP32 1
+#elif defined(__XTENSA__)
+#define GTEST_OS_XTENSA 1
 #endif  // __CYGWIN__
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_ARCH_H_
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h
index 60ff471..0953a78 100644
--- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h
+++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-port.h
@@ -40,8 +40,8 @@
 
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
 
 // Environment-describing macros
 // -----------------------------
@@ -199,9 +199,18 @@
 //                                        suppressed (constant conditional).
 //   GTEST_INTENTIONAL_CONST_COND_POP_  - finish code section where MSVC C4127
 //                                        is suppressed.
+//   GTEST_INTERNAL_HAS_ANY - for enabling UniversalPrinter<std::any> or
+//                            UniversalPrinter<absl::any> specializations.
+//   GTEST_INTERNAL_HAS_OPTIONAL - for enabling UniversalPrinter<std::optional>
+//                                 or UniversalPrinter<absl::optional>
+//                                 specializations.
 //   GTEST_INTERNAL_HAS_STRING_VIEW - for enabling Matcher<std::string_view> or
 //                                    Matcher<absl::string_view>
 //                                    specializations.
+//   GTEST_INTERNAL_HAS_VARIANT - for enabling UniversalPrinter<std::variant> or
+//                                UniversalPrinter<absl::variant>
+//                                specializations.
 //
 // Synchronization:
 //   Mutex, MutexLock, ThreadLocal, GetThreadCount()
@@ -252,6 +261,8 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+
+#include <cerrno>
 #include <cstdint>
 #include <limits>
 #include <type_traits>
@@ -267,6 +278,7 @@
 #endif
 
 #include <iostream>  // NOLINT
+#include <locale>
 #include <memory>
 #include <string>  // NOLINT
 #include <tuple>
@@ -347,6 +359,10 @@
 // WindowsTypesTest.CRITICAL_SECTIONIs_RTL_CRITICAL_SECTION.
 typedef struct _RTL_CRITICAL_SECTION GTEST_CRITICAL_SECTION;
 #endif
+#elif GTEST_OS_XTENSA
+#include <unistd.h>
+// Xtensa toolchains define strcasecmp in the string.h header instead of
+// strings.h. string.h is already included.
 #else
 // This assumes that non-Windows OSes provide unistd.h. For OSes where this
 // is not the case, we need to include headers that provide the functions
@@ -367,7 +383,7 @@
 // On Android, <regex.h> is only available starting with Gingerbread.
 #  define GTEST_HAS_POSIX_RE (__ANDROID_API__ >= 9)
 # else
-#  define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS)
+#define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS && !GTEST_OS_XTENSA)
 # endif
 #endif
 
@@ -452,7 +468,7 @@
 // no support for it at least as recent as Froyo (2.2).
 #define GTEST_HAS_STD_WSTRING                                         \
   (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \
-     GTEST_OS_HAIKU || GTEST_OS_ESP32 || GTEST_OS_ESP8266))
+     GTEST_OS_HAIKU || GTEST_OS_ESP32 || GTEST_OS_ESP8266 || GTEST_OS_XTENSA))
 
 #endif  // GTEST_HAS_STD_WSTRING
 
@@ -577,7 +593,7 @@
 // By default, we assume that stream redirection is supported on all
 // platforms except known mobile ones.
 #if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
-    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266
+    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA
 #  define GTEST_HAS_STREAM_REDIRECTION 0
 # else
 #  define GTEST_HAS_STREAM_REDIRECTION 1
@@ -679,8 +695,8 @@
 // A macro to disallow copy constructor and operator=
 // This should be used in the private: declarations for a class.
 #define GTEST_DISALLOW_COPY_AND_ASSIGN_(type) \
-  type(type const &) = delete; \
-  GTEST_DISALLOW_ASSIGN_(type)
+  type(type const&) = delete;                 \
+  type& operator=(type const&) = delete
 
 // A macro to disallow move operator=
 // This should be used in the private: declarations for a class.
@@ -690,8 +706,8 @@
 // A macro to disallow move constructor and operator=
 // This should be used in the private: declarations for a class.
 #define GTEST_DISALLOW_MOVE_AND_ASSIGN_(type) \
-  type(type &&) noexcept = delete; \
-  GTEST_DISALLOW_MOVE_ASSIGN_(type)
+  type(type&&) noexcept = delete;             \
+  type& operator=(type&&) noexcept = delete
 
 // Tell the compiler to warn about unused return values for functions declared
 // with this macro.  The macro should be used on function declarations
@@ -918,8 +934,6 @@
   const char* full_pattern_;  // For FullMatch();
 
 # endif
-
-  GTEST_DISALLOW_ASSIGN_(RE);
 };
 
 #endif  // GTEST_USES_PCRE
@@ -1926,6 +1940,19 @@
 inline bool IsXDigit(char ch) {
   return isxdigit(static_cast<unsigned char>(ch)) != 0;
 }
+#ifdef __cpp_char8_t
+inline bool IsXDigit(char8_t ch) {
+  return isxdigit(static_cast<unsigned char>(ch)) != 0;
+}
+#endif
+inline bool IsXDigit(char16_t ch) {
+  const unsigned char low_byte = static_cast<unsigned char>(ch);
+  return ch == low_byte && isxdigit(low_byte) != 0;
+}
+inline bool IsXDigit(char32_t ch) {
+  const unsigned char low_byte = static_cast<unsigned char>(ch);
+  return ch == low_byte && isxdigit(low_byte) != 0;
+}
 inline bool IsXDigit(wchar_t ch) {
   const unsigned char low_byte = static_cast<unsigned char>(ch);
   return ch == low_byte && isxdigit(low_byte) != 0;
@@ -1960,16 +1987,16 @@
 typedef struct _stat StatStruct;
 
 # ifdef __BORLANDC__
-inline int IsATTY(int fd) { return isatty(fd); }
+inline int DoIsATTY(int fd) { return isatty(fd); }
 inline int StrCaseCmp(const char* s1, const char* s2) {
   return stricmp(s1, s2);
 }
 inline char* StrDup(const char* src) { return strdup(src); }
 # else  // !__BORLANDC__
 #  if GTEST_OS_WINDOWS_MOBILE
-inline int IsATTY(int /* fd */) { return 0; }
+inline int DoIsATTY(int /* fd */) { return 0; }
 #  else
-inline int IsATTY(int fd) { return _isatty(fd); }
+inline int DoIsATTY(int fd) { return _isatty(fd); }
 #  endif  // GTEST_OS_WINDOWS_MOBILE
 inline int StrCaseCmp(const char* s1, const char* s2) {
   return _stricmp(s1, s2);
@@ -1994,7 +2021,7 @@
 typedef struct stat StatStruct;
 
 inline int FileNo(FILE* file) { return fileno(file); }
-inline int IsATTY(int fd) { return isatty(fd); }
+inline int DoIsATTY(int fd) { return isatty(fd); }
 inline int Stat(const char* path, StatStruct* buf) {
   // stat function not implemented on ESP8266
   return 0;
@@ -2011,7 +2038,7 @@
 typedef struct stat StatStruct;
 
 inline int FileNo(FILE* file) { return fileno(file); }
-inline int IsATTY(int fd) { return isatty(fd); }
+inline int DoIsATTY(int fd) { return isatty(fd); }
 inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); }
 inline int StrCaseCmp(const char* s1, const char* s2) {
   return strcasecmp(s1, s2);
@@ -2022,6 +2049,17 @@
 
 #endif  // GTEST_OS_WINDOWS
 
+inline int IsATTY(int fd) {
+  // DoIsATTY might change errno (for example ENOTTY in case you redirect stdout
+  // to a file on Linux), which is unexpected, so save the previous value, and
+  // restore it after the call.
+  int savedErrno = errno;
+  int isAttyValue = DoIsATTY(fd);
+  errno = savedErrno;
+
+  return isAttyValue;
+}
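
A minimal sketch of the errno discipline the wrapper enforces; without the save/restore, callers doing errno-based error handling around an IsATTY() call could observe a spurious ENOTTY on Linux:

#include <cerrno>
#include <unistd.h>

// Same pattern as IsATTY() above, shown standalone (POSIX-only sketch).
int CheckedIsATTY(int fd) {
  const int saved_errno = errno;
  const int result = isatty(fd);  // may set errno (e.g. ENOTTY) on failure
  errno = saved_errno;            // hide the side effect from callers
  return result;
}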
+
 // Functions deprecated by MSVC 8.0.
 
 GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
@@ -2030,11 +2068,20 @@
 // StrError() aren't needed on Windows CE at this time and thus not
 // defined there.
 
-#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && !GTEST_OS_WINDOWS_RT
+#if !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_WINDOWS_PHONE && \
+    !GTEST_OS_WINDOWS_RT && !GTEST_OS_ESP8266 && !GTEST_OS_XTENSA
 inline int ChDir(const char* dir) { return chdir(dir); }
 #endif
 inline FILE* FOpen(const char* path, const char* mode) {
+#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
+  struct wchar_codecvt : public std::codecvt<wchar_t, char, std::mbstate_t> {};
+  std::wstring_convert<wchar_codecvt> converter;
+  std::wstring wide_path = converter.from_bytes(path);
+  std::wstring wide_mode = converter.from_bytes(mode);
+  return _wfopen(wide_path.c_str(), wide_mode.c_str());
+#else  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
   return fopen(path, mode);
+#endif  // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MINGW
 }
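
The Windows branch exists because fopen() interprets narrow paths in the active ANSI code page, so UTF-8 names containing non-ASCII bytes may fail to open; converting to UTF-16 and calling _wfopen() avoids that. A trimmed-down, Windows-only sketch of the same conversion (the local codecvt subclass works around std::codecvt's protected destructor):

#include <codecvt>
#include <cstdio>
#include <locale>
#include <string>

FILE* OpenUtf8(const char* utf8_path, const char* mode) {
  struct wchar_codecvt : std::codecvt<wchar_t, char, std::mbstate_t> {};
  std::wstring_convert<wchar_codecvt> converter;
  const std::wstring wide_path = converter.from_bytes(utf8_path);
  const std::wstring wide_mode = converter.from_bytes(mode);
  return _wfopen(wide_path.c_str(), wide_mode.c_str());  // MSVC-only API
}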
 #if !GTEST_OS_WINDOWS_MOBILE
 inline FILE *FReopen(const char* path, const char* mode, FILE* stream) {
@@ -2055,7 +2102,7 @@
 #endif
 inline const char* GetEnv(const char* name) {
 #if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
-    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266
+    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_XTENSA
   // We are on an embedded platform, which has no environment variables.
   static_cast<void>(name);  // To prevent 'unused argument' warning.
   return nullptr;
@@ -2191,7 +2238,8 @@
 // Parses 'str' for a 32-bit signed integer.  If successful, writes the result
 // to *value and returns true; otherwise leaves *value unchanged and returns
 // false.
-bool ParseInt32(const Message& src_text, const char* str, int32_t* value);
+GTEST_API_ bool ParseInt32(const Message& src_text, const char* str,
+                           int32_t* value);
 
 // Parses a bool/int32_t/string from the environment variable
 // corresponding to the given Google Test flag.
@@ -2224,6 +2272,64 @@
 #endif  // !defined(GTEST_INTERNAL_DEPRECATED)
 
 #if GTEST_HAS_ABSL
+// Always use absl::any for UniversalPrinter<> specializations if googletest
+// is built with absl support.
+#define GTEST_INTERNAL_HAS_ANY 1
+#include "absl/types/any.h"
+namespace testing {
+namespace internal {
+using Any = ::absl::any;
+}  // namespace internal
+}  // namespace testing
+#else
+#ifdef __has_include
+#if __has_include(<any>) && __cplusplus >= 201703L
+// Otherwise for C++17 and higher use std::any for UniversalPrinter<>
+// specializations.
+#define GTEST_INTERNAL_HAS_ANY 1
+#include <any>
+namespace testing {
+namespace internal {
+using Any = ::std::any;
+}  // namespace internal
+}  // namespace testing
+// The case where absl is configured NOT to alias std::any is not
+// supported.
+#endif  // __has_include(<any>) && __cplusplus >= 201703L
+#endif  // __has_include
+#endif  // GTEST_HAS_ABSL
+
+#if GTEST_HAS_ABSL
+// Always use absl::optional for UniversalPrinter<> specializations if
+// googletest is built with absl support.
+#define GTEST_INTERNAL_HAS_OPTIONAL 1
+#include "absl/types/optional.h"
+namespace testing {
+namespace internal {
+template <typename T>
+using Optional = ::absl::optional<T>;
+}  // namespace internal
+}  // namespace testing
+#else
+#ifdef __has_include
+#if __has_include(<optional>) && __cplusplus >= 201703L
+// Otherwise for C++17 and higher use std::optional for UniversalPrinter<>
+// specializations.
+#define GTEST_INTERNAL_HAS_OPTIONAL 1
+#include <optional>
+namespace testing {
+namespace internal {
+template <typename T>
+using Optional = ::std::optional<T>;
+}  // namespace internal
+}  // namespace testing
+// The case where absl is configured NOT to alias std::optional is not
+// supported.
+#endif  // __has_include(<optional>) && __cplusplus >= 201703L
+#endif  // __has_include
+#endif  // GTEST_HAS_ABSL
+
+#if GTEST_HAS_ABSL
 // Always use absl::string_view for Matcher<> specializations if googletest
 // is built with absl support.
 # define GTEST_INTERNAL_HAS_STRING_VIEW 1
@@ -2251,4 +2357,33 @@
 # endif  // __has_include
 #endif  // GTEST_HAS_ABSL
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
+#if GTEST_HAS_ABSL
+// Always use absl::variant for UniversalPrinter<> specializations if googletest
+// is built with absl support.
+#define GTEST_INTERNAL_HAS_VARIANT 1
+#include "absl/types/variant.h"
+namespace testing {
+namespace internal {
+template <typename... T>
+using Variant = ::absl::variant<T...>;
+}  // namespace internal
+}  // namespace testing
+#else
+#ifdef __has_include
+#if __has_include(<variant>) && __cplusplus >= 201703L
+// Otherwise for C++17 and higher use std::variant for UniversalPrinter<>
+// specializations.
+#define GTEST_INTERNAL_HAS_VARIANT 1
+#include <variant>
+namespace testing {
+namespace internal {
+template <typename... T>
+using Variant = ::std::variant<T...>;
+}  // namespace internal
+}  // namespace testing
+// The case where absl is configured NOT to alias std::variant is not supported.
+#endif  // __has_include(<variant>) && __cplusplus >= 201703L
+#endif  // __has_include
+#endif  // GTEST_HAS_ABSL
+
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h
index 0b2a91a..10f774f 100644
--- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h
+++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-string.h
@@ -38,8 +38,8 @@
 
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
 
 #ifdef __BORLANDC__
 // string.h is not guaranteed to provide strcpy on C++ Builder.
@@ -149,6 +149,9 @@
   // Formats an int value as "%02d".
   static std::string FormatIntWidth2(int value);  // "%02d" for width == 2
 
+  // Formats an int value to given width with leading zeros.
+  static std::string FormatIntWidthN(int value, int width);
+
   // Formats an int value as "%X".
   static std::string FormatHexInt(int value);
 
@@ -169,4 +172,4 @@
 }  // namespace internal
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_
diff --git a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h
index 082fdad..b87a2e2 100644
--- a/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h
+++ b/libvpx/third_party/googletest/src/include/gtest/internal/gtest-type-util.h
@@ -32,8 +32,8 @@
 
 // GOOGLETEST_CM0001 DO NOT DELETE
 
-#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
-#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+#ifndef GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+#define GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
 
 #include "gtest/internal/gtest-port.h"
 
@@ -64,37 +64,39 @@
   return s;
 }
 
-// GetTypeName<T>() returns a human-readable name of type T.
-// NB: This function is also used in Google Mock, so don't move it inside of
-// the typed-test-only section below.
-template <typename T>
-std::string GetTypeName() {
-# if GTEST_HAS_RTTI
-
-  const char* const name = typeid(T).name();
-#  if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
+#if GTEST_HAS_RTTI
+// GetTypeName(const std::type_info&) returns a human-readable name of the type.
+inline std::string GetTypeName(const std::type_info& type) {
+  const char* const name = type.name();
+#if GTEST_HAS_CXXABI_H_ || defined(__HP_aCC)
   int status = 0;
   // gcc's implementation of typeid(T).name() mangles the type name,
   // so we have to demangle it.
-#   if GTEST_HAS_CXXABI_H_
+#if GTEST_HAS_CXXABI_H_
   using abi::__cxa_demangle;
-#   endif  // GTEST_HAS_CXXABI_H_
+#endif  // GTEST_HAS_CXXABI_H_
   char* const readable_name = __cxa_demangle(name, nullptr, nullptr, &status);
   const std::string name_str(status == 0 ? readable_name : name);
   free(readable_name);
   return CanonicalizeForStdLibVersioning(name_str);
-#  else
+#else
   return name;
-#  endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
-
-# else
-
-  return "<type>";
-
-# endif  // GTEST_HAS_RTTI
+#endif  // GTEST_HAS_CXXABI_H_ || __HP_aCC
 }
+#endif  // GTEST_HAS_RTTI
 
-#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
+// GetTypeName<T>() returns a human-readable name of type T if and only if
+// RTTI is enabled, otherwise it returns a dummy type name.
+// NB: This function is also used in Google Mock, so don't move it inside of
+// the typed-test-only section below.
+template <typename T>
+std::string GetTypeName() {
+#if GTEST_HAS_RTTI
+  return GetTypeName(typeid(T));
+#else
+  return "<type>";
+#endif  // GTEST_HAS_RTTI
+}
 
 // A unique type indicating an empty node
 struct None {};
@@ -171,8 +173,6 @@
   using type = typename proxy::type;
 };
 
-#endif  // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P
-
 }  // namespace internal
 
 template <typename... Ts>
@@ -180,4 +180,4 @@
 
 }  // namespace testing
 
-#endif  // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
+#endif  // GOOGLETEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_
diff --git a/libvpx/third_party/googletest/src/src/gtest-death-test.cc b/libvpx/third_party/googletest/src/src/gtest-death-test.cc
index 5d1031b..bf4f633 100644
--- a/libvpx/third_party/googletest/src/src/gtest-death-test.cc
+++ b/libvpx/third_party/googletest/src/src/gtest-death-test.cc
@@ -32,6 +32,7 @@
 
 #include "gtest/gtest-death-test.h"
 
+#include <functional>
 #include <utility>
 
 #include "gtest/internal/gtest-port.h"
@@ -247,7 +248,7 @@
     msg << "detected " << thread_count << " threads.";
   }
   msg << " See "
-         "https://github.com/google/googletest/blob/master/googletest/docs/"
+         "https://github.com/google/googletest/blob/master/docs/"
          "advanced.md#death-tests-and-threads"
       << " for more explanation and suggested solutions, especially if"
       << " this is the last message you see before your test times out.";
@@ -864,7 +865,7 @@
   }
 
   int size() {
-    return args_.size() - 1;
+    return static_cast<int>(args_.size()) - 1;
   }
 
  private:
@@ -890,18 +891,17 @@
 
   // Register to wait for the child process to terminate.
   status_zx = child_process_.wait_async(
-      port, kProcessKey, ZX_PROCESS_TERMINATED, ZX_WAIT_ASYNC_ONCE);
+      port, kProcessKey, ZX_PROCESS_TERMINATED, 0);
   GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
 
   // Register to wait for the socket to be readable or closed.
   status_zx = stderr_socket_.wait_async(
-      port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED,
-      ZX_WAIT_ASYNC_ONCE);
+      port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, 0);
   GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
 
   // Register to wait for an exception.
   status_zx = exception_channel_.wait_async(
-      port, kExceptionKey, ZX_CHANNEL_READABLE, ZX_WAIT_ASYNC_ONCE);
+      port, kExceptionKey, ZX_CHANNEL_READABLE, 0);
   GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
 
   bool process_terminated = false;
@@ -941,8 +941,7 @@
         } else {
           GTEST_DEATH_TEST_CHECK_(status_zx == ZX_ERR_SHOULD_WAIT);
           status_zx = stderr_socket_.wait_async(
-              port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED,
-              ZX_WAIT_ASYNC_ONCE);
+              port, kSocketKey, ZX_SOCKET_READABLE | ZX_SOCKET_PEER_CLOSED, 0);
           GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
         }
       } else {
@@ -955,12 +954,12 @@
   ReadAndInterpretStatusByte();
 
   zx_info_process_t buffer;
-  status_zx = child_process_.get_info(
-      ZX_INFO_PROCESS, &buffer, sizeof(buffer), nullptr, nullptr);
+  status_zx = child_process_.get_info(ZX_INFO_PROCESS, &buffer, sizeof(buffer),
+                                      nullptr, nullptr);
   GTEST_DEATH_TEST_CHECK_(status_zx == ZX_OK);
 
-  GTEST_DEATH_TEST_CHECK_(buffer.exited);
-  set_status(buffer.return_code);
+  GTEST_DEATH_TEST_CHECK_(buffer.flags & ZX_INFO_PROCESS_FLAG_EXITED);
+  set_status(static_cast<int>(buffer.return_code));
   return status();
 }
 
@@ -1225,21 +1224,9 @@
   int close_fd;       // File descriptor to close; the read end of a pipe
 };
 
-#  if GTEST_OS_MAC
-inline char** GetEnviron() {
-  // When Google Test is built as a framework on MacOS X, the environ variable
-  // is unavailable. Apple's documentation (man environ) recommends using
-  // _NSGetEnviron() instead.
-  return *_NSGetEnviron();
-}
-#  else
-// Some POSIX platforms expect you to declare environ. extern "C" makes
-// it reside in the global namespace.
+#  if GTEST_OS_QNX
 extern "C" char** environ;
-inline char** GetEnviron() { return environ; }
-#  endif  // GTEST_OS_MAC
-
-#  if !GTEST_OS_QNX
+#  else  // GTEST_OS_QNX
 // The main function for a threadsafe-style death test child process.
 // This function is called in a clone()-ed process and thus must avoid
 // any potentially unsafe operations like malloc or libc functions.
@@ -1259,18 +1246,18 @@
     return EXIT_FAILURE;
   }
 
-  // We can safely call execve() as it's a direct system call.  We
+  // We can safely call execv() as it's almost a direct system call. We
   // cannot use execvp() as it's a libc function and thus potentially
-  // unsafe.  Since execve() doesn't search the PATH, the user must
+  // unsafe.  Since execv() doesn't search the PATH, the user must
   // invoke the test program via a valid path that contains at least
   // one path separator.
-  execve(args->argv[0], args->argv, GetEnviron());
-  DeathTestAbort(std::string("execve(") + args->argv[0] + ", ...) in " +
+  execv(args->argv[0], args->argv);
+  DeathTestAbort(std::string("execv(") + args->argv[0] + ", ...) in " +
                  original_dir + " failed: " +
                  GetLastErrnoDescription());
   return EXIT_FAILURE;
 }
-#  endif  // !GTEST_OS_QNX
+#  endif  // GTEST_OS_QNX
 
 #  if GTEST_HAS_CLONE
 // Two utility routines that together determine the direction the stack
@@ -1284,19 +1271,24 @@
 // correct answer.
 static void StackLowerThanAddress(const void* ptr,
                                   bool* result) GTEST_NO_INLINE_;
+// Make sure sanitizers do not tamper with the stack here.
+// Ideally, we want to use `__builtin_frame_address` instead of a local variable
+// address with sanitizer disabled, but it does not work when the
+// compiler optimizes the stack frame out, which happens on PowerPC targets.
 // HWAddressSanitizer add a random tag to the MSB of the local variable address,
 // making comparison result unpredictable.
+GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
 GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
 static void StackLowerThanAddress(const void* ptr, bool* result) {
-  int dummy;
-  *result = (&dummy < ptr);
+  int dummy = 0;
+  *result = std::less<const void*>()(&dummy, ptr);
 }
 
 // Make sure AddressSanitizer does not tamper with the stack here.
 GTEST_ATTRIBUTE_NO_SANITIZE_ADDRESS_
 GTEST_ATTRIBUTE_NO_SANITIZE_HWADDRESS_
 static bool StackGrowsDown() {
-  int dummy;
+  int dummy = 0;
   bool result;
   StackLowerThanAddress(&dummy, &result);
   return result;
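
std::less is used instead of a bare "<" because relational comparison of pointers into different complete objects has an unspecified result, while the pointer specialization of std::less is required to impose a strict total order. In miniature:

#include <functional>

bool PointerLess(const int* a, const int* b) {
  // return a < b;                       // unspecified for unrelated objects
  return std::less<const int*>()(a, b);  // guaranteed strict total order
}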
@@ -1339,8 +1331,7 @@
                                         fd_flags | FD_CLOEXEC));
   struct inheritance inherit = {0};
   // spawn is a system call.
-  child_pid =
-      spawn(args.argv[0], 0, nullptr, &inherit, args.argv, GetEnviron());
+  child_pid = spawn(args.argv[0], 0, nullptr, &inherit, args.argv, environ);
   // Restores the current working directory.
   GTEST_DEATH_TEST_CHECK_(fchdir(cwd_fd) != -1);
   GTEST_DEATH_TEST_CHECK_SYSCALL_(close(cwd_fd));
diff --git a/libvpx/third_party/googletest/src/src/gtest-filepath.cc b/libvpx/third_party/googletest/src/src/gtest-filepath.cc
index 9aad12f..0b56294 100644
--- a/libvpx/third_party/googletest/src/src/gtest-filepath.cc
+++ b/libvpx/third_party/googletest/src/src/gtest-filepath.cc
@@ -92,8 +92,9 @@
 
 // Returns the current working directory, or "" if unsuccessful.
 FilePath FilePath::GetCurrentDir() {
-#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE || \
-    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_ESP32
+#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_WINDOWS_PHONE ||         \
+    GTEST_OS_WINDOWS_RT || GTEST_OS_ESP8266 || GTEST_OS_ESP32 || \
+    GTEST_OS_XTENSA
   // These platforms do not have a current directory, so we just return
   // something reasonable.
   return FilePath(kCurrentDirectoryString);
@@ -209,7 +210,7 @@
   delete [] unicode;
   return attributes != kInvalidFileAttributes;
 #else
-  posix::StatStruct file_stat;
+  posix::StatStruct file_stat{};
   return posix::Stat(pathname_.c_str(), &file_stat) == 0;
 #endif  // GTEST_OS_WINDOWS_MOBILE
 }
@@ -236,7 +237,7 @@
     result = true;
   }
 #else
-  posix::StatStruct file_stat;
+  posix::StatStruct file_stat{};
   result = posix::Stat(path.c_str(), &file_stat) == 0 &&
       posix::IsDir(file_stat);
 #endif  // GTEST_OS_WINDOWS_MOBILE
@@ -323,7 +324,7 @@
   delete [] unicode;
 #elif GTEST_OS_WINDOWS
   int result = _mkdir(pathname_.c_str());
-#elif GTEST_OS_ESP8266
+#elif GTEST_OS_ESP8266 || GTEST_OS_XTENSA
   // do nothing
   int result = 0;
 #else
@@ -349,33 +350,19 @@
 // For example, "bar///foo" becomes "bar/foo". Does not eliminate other
 // redundancies that might be in a pathname involving "." or "..".
 void FilePath::Normalize() {
-  if (pathname_.c_str() == nullptr) {
-    pathname_ = "";
-    return;
-  }
-  const char* src = pathname_.c_str();
-  char* const dest = new char[pathname_.length() + 1];
-  char* dest_ptr = dest;
-  memset(dest_ptr, 0, pathname_.length() + 1);
+  auto out = pathname_.begin();
 
-  while (*src != '\0') {
-    *dest_ptr = *src;
-    if (!IsPathSeparator(*src)) {
-      src++;
+  for (const char character : pathname_) {
+    if (!IsPathSeparator(character)) {
+      *(out++) = character;
+    } else if (out == pathname_.begin() || *std::prev(out) != kPathSeparator) {
+      *(out++) = kPathSeparator;
     } else {
-#if GTEST_HAS_ALT_PATH_SEP_
-      if (*dest_ptr == kAlternatePathSeparator) {
-        *dest_ptr = kPathSeparator;
-      }
-#endif
-      while (IsPathSeparator(*src))
-        src++;
+      continue;
     }
-    dest_ptr++;
   }
-  *dest_ptr = '\0';
-  pathname_ = dest;
-  delete[] dest;
+
+  pathname_.erase(out, pathname_.end());
 }
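
The rewritten Normalize() is the standard in-place filter-and-erase idiom: characters are copied forward through "out", any run of separators collapses to a single kPathSeparator, and the leftover tail is erased. A standalone equivalent over plain '/' (the real code also folds the Windows alternate separator via IsPathSeparator()):

#include <iterator>
#include <string>

std::string Normalize(std::string path) {
  auto out = path.begin();
  for (const char c : path) {
    if (c != '/') {
      *out++ = c;    // ordinary character: keep
    } else if (out == path.begin() || *std::prev(out) != '/') {
      *out++ = '/';  // first separator of a run: keep one
    }                // further separators in the run: drop
  }
  path.erase(out, path.end());
  return path;  // Normalize("bar///foo") == "bar/foo"
}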
 
 }  // namespace internal
diff --git a/libvpx/third_party/googletest/src/src/gtest-internal-inl.h b/libvpx/third_party/googletest/src/src/gtest-internal-inl.h
index e42ff47..6d8cecb 100644
--- a/libvpx/third_party/googletest/src/src/gtest-internal-inl.h
+++ b/libvpx/third_party/googletest/src/src/gtest-internal-inl.h
@@ -31,8 +31,8 @@
 // This file contains purely Google Test's internal implementation.  Please
 // DO NOT #INCLUDE IT IN A USER PROGRAM.
 
-#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_
-#define GTEST_SRC_GTEST_INTERNAL_INL_H_
+#ifndef GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_
+#define GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_
 
 #ifndef _WIN32_WCE
 # include <errno.h>
@@ -84,9 +84,11 @@
 const char kBreakOnFailureFlag[] = "break_on_failure";
 const char kCatchExceptionsFlag[] = "catch_exceptions";
 const char kColorFlag[] = "color";
+const char kFailFast[] = "fail_fast";
 const char kFilterFlag[] = "filter";
 const char kListTestsFlag[] = "list_tests";
 const char kOutputFlag[] = "output";
+const char kBriefFlag[] = "brief";
 const char kPrintTimeFlag[] = "print_time";
 const char kPrintUTF8Flag[] = "print_utf8";
 const char kRandomSeedFlag[] = "random_seed";
@@ -164,10 +166,12 @@
     color_ = GTEST_FLAG(color);
     death_test_style_ = GTEST_FLAG(death_test_style);
     death_test_use_fork_ = GTEST_FLAG(death_test_use_fork);
+    fail_fast_ = GTEST_FLAG(fail_fast);
     filter_ = GTEST_FLAG(filter);
     internal_run_death_test_ = GTEST_FLAG(internal_run_death_test);
     list_tests_ = GTEST_FLAG(list_tests);
     output_ = GTEST_FLAG(output);
+    brief_ = GTEST_FLAG(brief);
     print_time_ = GTEST_FLAG(print_time);
     print_utf8_ = GTEST_FLAG(print_utf8);
     random_seed_ = GTEST_FLAG(random_seed);
@@ -187,9 +191,11 @@
     GTEST_FLAG(death_test_style) = death_test_style_;
     GTEST_FLAG(death_test_use_fork) = death_test_use_fork_;
     GTEST_FLAG(filter) = filter_;
+    GTEST_FLAG(fail_fast) = fail_fast_;
     GTEST_FLAG(internal_run_death_test) = internal_run_death_test_;
     GTEST_FLAG(list_tests) = list_tests_;
     GTEST_FLAG(output) = output_;
+    GTEST_FLAG(brief) = brief_;
     GTEST_FLAG(print_time) = print_time_;
     GTEST_FLAG(print_utf8) = print_utf8_;
     GTEST_FLAG(random_seed) = random_seed_;
@@ -208,10 +214,12 @@
   std::string color_;
   std::string death_test_style_;
   bool death_test_use_fork_;
+  bool fail_fast_;
   std::string filter_;
   std::string internal_run_death_test_;
   bool list_tests_;
   std::string output_;
+  bool brief_;
   bool print_time_;
   bool print_utf8_;
   int32_t random_seed_;
@@ -386,13 +394,6 @@
 
   // Functions for processing the gtest_filter flag.
 
-  // Returns true if and only if the wildcard pattern matches the string.
-  // The first ':' or '\0' character in pattern marks the end of it.
-  //
-  // This recursive algorithm isn't very efficient, but is clear and
-  // works well enough for matching test names, which are short.
-  static bool PatternMatchesString(const char *pattern, const char *str);
-
   // Returns true if and only if the user-specified filter matches the test
   // suite name and the test name.
   static bool FilterMatchesTest(const std::string& test_suite_name,
@@ -647,10 +648,10 @@
   // Arguments:
   //
   //   test_suite_name: name of the test suite
-  //   type_param:     the name of the test's type parameter, or NULL if
-  //                   this is not a typed or a type-parameterized test.
-  //   set_up_tc:      pointer to the function that sets up the test suite
-  //   tear_down_tc:   pointer to the function that tears down the test suite
+  //   type_param:      the name of the test's type parameter, or NULL if
+  //                    this is not a typed or a type-parameterized test.
+  //   set_up_tc:       pointer to the function that sets up the test suite
+  //   tear_down_tc:    pointer to the function that tears down the test suite
   TestSuite* GetTestSuite(const char* test_suite_name, const char* type_param,
                           internal::SetUpTestSuiteFunc set_up_tc,
                           internal::TearDownTestSuiteFunc tear_down_tc);
@@ -674,6 +675,7 @@
   void AddTestInfo(internal::SetUpTestSuiteFunc set_up_tc,
                    internal::TearDownTestSuiteFunc tear_down_tc,
                    TestInfo* test_info) {
+#if GTEST_HAS_DEATH_TEST
     // In order to support thread-safe death tests, we need to
     // remember the original working directory when the test program
     // was first invoked.  We cannot do this in RUN_ALL_TESTS(), as
@@ -686,6 +688,7 @@
       GTEST_CHECK_(!original_working_dir_.IsEmpty())
           << "Failed to get the current working directory.";
     }
+#endif  // GTEST_HAS_DEATH_TEST
 
     GetTestSuite(test_info->test_suite_name(), test_info->type_param(),
                  set_up_tc, tear_down_tc)
@@ -1161,13 +1164,13 @@
   }
 
   // Note that "event=TestCaseStart" is a wire format and has to remain
-  // "case" for compatibilty
+  // "case" for compatibility
   void OnTestCaseStart(const TestCase& test_case) override {
     SendLn(std::string("event=TestCaseStart&name=") + test_case.name());
   }
 
   // Note that "event=TestCaseEnd" is a wire format and has to remain
-  // "case" for compatibilty
+  // "case" for compatibility
   void OnTestCaseEnd(const TestCase& test_case) override {
     SendLn("event=TestCaseEnd&passed=" + FormatBool(test_case.Passed()) +
            "&elapsed_time=" + StreamableToString(test_case.elapsed_time()) +
@@ -1215,4 +1218,4 @@
 
 GTEST_DISABLE_MSC_WARNINGS_POP_()  //  4251
 
-#endif  // GTEST_SRC_GTEST_INTERNAL_INL_H_
+#endif  // GOOGLETEST_SRC_GTEST_INTERNAL_INL_H_
diff --git a/libvpx/third_party/googletest/src/src/gtest-port.cc b/libvpx/third_party/googletest/src/src/gtest-port.cc
index a05c50a..53a4d37 100644
--- a/libvpx/third_party/googletest/src/src/gtest-port.cc
+++ b/libvpx/third_party/googletest/src/src/gtest-port.cc
@@ -198,7 +198,8 @@
   if (sysctl(mib, miblen, NULL, &size, NULL, 0)) {
     return 0;
   }
-  mib[5] = size / mib[4];
+
+  mib[5] = static_cast<int>(size / static_cast<size_t>(mib[4]));
 
   // populate array of structs
   struct kinfo_proc info[mib[5]];
@@ -207,8 +208,8 @@
   }
 
   // exclude empty members
-  int nthreads = 0;
-  for (int i = 0; i < size / mib[4]; i++) {
+  size_t nthreads = 0;
+  for (size_t i = 0; i < size / static_cast<size_t>(mib[4]); i++) {
     if (info[i].p_tid != -1)
       nthreads++;
   }
@@ -687,8 +688,8 @@
   static Mutex thread_map_mutex_;
 };
 
-Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex);
-Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex);
+Mutex ThreadLocalRegistryImpl::mutex_(Mutex::kStaticMutex);  // NOLINT
+Mutex ThreadLocalRegistryImpl::thread_map_mutex_(Mutex::kStaticMutex);  // NOLINT
 
 ThreadLocalValueHolderBase* ThreadLocalRegistry::GetValueOnCurrentThread(
       const ThreadLocalBase* thread_local_instance) {
@@ -1094,9 +1095,9 @@
     filename_ = temp_file_path;
 # else
     // There's no guarantee that a test has write access to the current
-    // directory, so we create the temporary file in the /tmp directory
-    // instead. We use /tmp on most systems, and /sdcard on Android.
-    // That's because Android doesn't have /tmp.
+    // directory, so we create the temporary file in a temporary directory.
+    std::string name_template;
+
 #  if GTEST_OS_LINUX_ANDROID
     // Note: Android applications are expected to call the framework's
     // Context.getExternalStorageDirectory() method through JNI to get
@@ -1109,17 +1110,46 @@
     // The location /data/local/tmp is directly accessible from native code.
     // '/sdcard' and other variants cannot be relied on, as they are not
     // guaranteed to be mounted, or may have a delay in mounting.
-    char name_template[] = "/data/local/tmp/gtest_captured_stream.XXXXXX";
+    name_template = "/data/local/tmp/";
+#  elif GTEST_OS_IOS
+    char user_temp_dir[PATH_MAX + 1];
+
+    // Documented alternative to NSTemporaryDirectory() (for obtaining a
+    // temporary directory) at
+    // https://developer.apple.com/library/archive/documentation/Security/Conceptual/SecureCodingGuide/Articles/RaceConditions.html#//apple_ref/doc/uid/TP40002585-SW10
+    //
+    // _CS_DARWIN_USER_TEMP_DIR (as well as _CS_DARWIN_USER_CACHE_DIR) is not
+    // documented in the confstr() man page at
+    // https://developer.apple.com/library/archive/documentation/System/Conceptual/ManPages_iPhoneOS/man3/confstr.3.html#//apple_ref/doc/man/3/confstr
+    // but are still available, according to the WebKit patches at
+    // https://trac.webkit.org/changeset/262004/webkit
+    // https://trac.webkit.org/changeset/263705/webkit
+    //
+    // The confstr() implementation falls back to getenv("TMPDIR"). See
+    // https://opensource.apple.com/source/Libc/Libc-1439.100.3/gen/confstr.c.auto.html
+    ::confstr(_CS_DARWIN_USER_TEMP_DIR, user_temp_dir, sizeof(user_temp_dir));
+
+    name_template = user_temp_dir;
+    if (name_template.back() != GTEST_PATH_SEP_[0])
+      name_template.push_back(GTEST_PATH_SEP_[0]);
 #  else
-    char name_template[] = "/tmp/captured_stream.XXXXXX";
-#  endif  // GTEST_OS_LINUX_ANDROID
-    const int captured_fd = mkstemp(name_template);
+    name_template = "/tmp/";
+#  endif
+    name_template.append("gtest_captured_stream.XXXXXX");
+
+    // mkstemp() modifies the string bytes in place, and does not go beyond the
+    // string's length. This results in well-defined behavior in C++17.
+    //
+    // The const_cast is needed below C++17. The constraints on std::string
+    // implementations in C++11 and above make the assumption behind the
+    // const_cast fairly safe.
+    const int captured_fd = ::mkstemp(const_cast<char*>(name_template.data()));
     if (captured_fd == -1) {
       GTEST_LOG_(WARNING)
           << "Failed to create tmp file " << name_template
           << " for test; does the test have access to the /tmp directory?";
     }
-    filename_ = name_template;
+    filename_ = std::move(name_template);
 # endif  // GTEST_OS_WINDOWS
     fflush(nullptr);
     dup2(captured_fd, fd_);
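
A compact sketch of the mkstemp() pattern above: the trailing "XXXXXX" is rewritten in place, so the std::string holds the real file name once the call returns (the path below is illustrative):

#include <cstdlib>
#include <string>
#include <unistd.h>

// Creates (and immediately unlinks) a unique temp file; returns its fd or -1.
int MakeTempFd() {
  std::string name = "/tmp/gtest_sketch.XXXXXX";
  // mkstemp() rewrites the XXXXXX bytes in place; the const_cast is only
  // needed before C++17, where std::string::data() returns const char*.
  const int fd = mkstemp(const_cast<char*>(name.data()));
  if (fd != -1) unlink(name.c_str());  // 'name' now holds the created path
  return fd;
}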
diff --git a/libvpx/third_party/googletest/src/src/gtest-printers.cc b/libvpx/third_party/googletest/src/src/gtest-printers.cc
index 3337be3..1b68fcb 100644
--- a/libvpx/third_party/googletest/src/src/gtest-printers.cc
+++ b/libvpx/third_party/googletest/src/src/gtest-printers.cc
@@ -42,11 +42,16 @@
 // defines Foo.
 
 #include "gtest/gtest-printers.h"
+
 #include <stdio.h>
+
 #include <cctype>
+#include <cstdint>
 #include <cwchar>
 #include <ostream>  // NOLINT
 #include <string>
+#include <type_traits>
+
 #include "gtest/internal/gtest-port.h"
 #include "src/gtest-internal-inl.h"
 
@@ -102,9 +107,19 @@
   *os << ">";
 }
 
+// Helpers for widening a character to char32_t. Since the standard does not
+// specify if char / wchar_t is signed or unsigned, it is important to first
+// convert it to the unsigned type of the same width before widening it to
+// char32_t.
+template <typename CharType>
+char32_t ToChar32(CharType in) {
+  return static_cast<char32_t>(
+      static_cast<typename std::make_unsigned<CharType>::type>(in));
+}
+
 }  // namespace
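
The two-step cast in ToChar32() is what prevents sign extension: where plain char is signed, widening a high-bit byte directly smears 1-bits across the upper bytes. Concretely:

#include <cassert>

int main() {
  const char c = '\xE9';  // value -23 on platforms where char is signed
  const char32_t wrong = static_cast<char32_t>(c);  // 0xFFFFFFE9 if signed
  const char32_t right =
      static_cast<char32_t>(static_cast<unsigned char>(c));  // always 0xE9
  assert(right == 0xE9);
  (void)wrong;
  return 0;
}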
 
-namespace internal2 {
+namespace internal {
 
 // Delegates to PrintBytesInObjectToImpl() to print the bytes in the
 // given object.  The delegation simplifies the implementation, which
@@ -116,10 +131,6 @@
   PrintBytesInObjectToImpl(obj_bytes, count, os);
 }
 
-}  // namespace internal2
-
-namespace internal {
-
 // Depending on the value of a char (or wchar_t), we print it in one
 // of three formats:
 //   - as is if it's a printable ASCII (e.g. 'a', '2', ' '),
@@ -134,18 +145,15 @@
 // Returns true if c is a printable ASCII character.  We test the
 // value of c directly instead of calling isprint(), which is buggy on
 // Windows Mobile.
-inline bool IsPrintableAscii(wchar_t c) {
-  return 0x20 <= c && c <= 0x7E;
-}
+inline bool IsPrintableAscii(char32_t c) { return 0x20 <= c && c <= 0x7E; }
 
-// Prints a wide or narrow char c as a character literal without the
-// quotes, escaping it when necessary; returns how c was formatted.
-// The template argument UnsignedChar is the unsigned version of Char,
-// which is the type of c.
-template <typename UnsignedChar, typename Char>
+// Prints c (of type char, char8_t, char16_t, char32_t, or wchar_t) as a
+// character literal without the quotes, escaping it when necessary; returns how
+// c was formatted.
+template <typename Char>
 static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) {
-  wchar_t w_c = static_cast<wchar_t>(c);
-  switch (w_c) {
+  const char32_t u_c = ToChar32(c);
+  switch (u_c) {
     case L'\0':
       *os << "\\0";
       break;
@@ -177,13 +185,12 @@
       *os << "\\v";
       break;
     default:
-      if (IsPrintableAscii(w_c)) {
+      if (IsPrintableAscii(u_c)) {
         *os << static_cast<char>(c);
         return kAsIs;
       } else {
         ostream::fmtflags flags = os->flags();
-        *os << "\\x" << std::hex << std::uppercase
-            << static_cast<int>(static_cast<UnsignedChar>(c));
+        *os << "\\x" << std::hex << std::uppercase << static_cast<int>(u_c);
         os->flags(flags);
         return kHexEscape;
       }
@@ -191,9 +198,9 @@
   return kSpecialEscape;
 }
 
-// Prints a wchar_t c as if it's part of a string literal, escaping it when
+// Prints a char32_t c as if it's part of a string literal, escaping it when
 // necessary; returns how c was formatted.
-static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) {
+static CharFormat PrintAsStringLiteralTo(char32_t c, ostream* os) {
   switch (c) {
     case L'\'':
       *os << "'";
@@ -202,26 +209,68 @@
       *os << "\\\"";
       return kSpecialEscape;
     default:
-      return PrintAsCharLiteralTo<wchar_t>(c, os);
+      return PrintAsCharLiteralTo(c, os);
   }
 }
 
+static const char* GetCharWidthPrefix(char) {
+  return "";
+}
+
+static const char* GetCharWidthPrefix(signed char) {
+  return "";
+}
+
+static const char* GetCharWidthPrefix(unsigned char) {
+  return "";
+}
+
+#ifdef __cpp_char8_t
+static const char* GetCharWidthPrefix(char8_t) {
+  return "u8";
+}
+#endif
+
+static const char* GetCharWidthPrefix(char16_t) {
+  return "u";
+}
+
+static const char* GetCharWidthPrefix(char32_t) {
+  return "U";
+}
+
+static const char* GetCharWidthPrefix(wchar_t) {
+  return "L";
+}
+
 // Prints a char c as if it's part of a string literal, escaping it when
 // necessary; returns how c was formatted.
 static CharFormat PrintAsStringLiteralTo(char c, ostream* os) {
-  return PrintAsStringLiteralTo(
-      static_cast<wchar_t>(static_cast<unsigned char>(c)), os);
+  return PrintAsStringLiteralTo(ToChar32(c), os);
 }
 
-// Prints a wide or narrow character c and its code.  '\0' is printed
-// as "'\\0'", other unprintable characters are also properly escaped
-// using the standard C++ escape sequence.  The template argument
-// UnsignedChar is the unsigned version of Char, which is the type of c.
-template <typename UnsignedChar, typename Char>
+#ifdef __cpp_char8_t
+static CharFormat PrintAsStringLiteralTo(char8_t c, ostream* os) {
+  return PrintAsStringLiteralTo(ToChar32(c), os);
+}
+#endif
+
+static CharFormat PrintAsStringLiteralTo(char16_t c, ostream* os) {
+  return PrintAsStringLiteralTo(ToChar32(c), os);
+}
+
+static CharFormat PrintAsStringLiteralTo(wchar_t c, ostream* os) {
+  return PrintAsStringLiteralTo(ToChar32(c), os);
+}
+
+// Prints a character c (of type char, char8_t, char16_t, char32_t, or wchar_t)
+// and its code. '\0' is printed as "'\\0'", other unprintable characters are
+// also properly escaped using the standard C++ escape sequence.
+template <typename Char>
 void PrintCharAndCodeTo(Char c, ostream* os) {
   // First, print c as a literal in the most readable form we can find.
-  *os << ((sizeof(c) > 1) ? "L'" : "'");
-  const CharFormat format = PrintAsCharLiteralTo<UnsignedChar>(c, os);
+  *os << GetCharWidthPrefix(c) << "'";
+  const CharFormat format = PrintAsCharLiteralTo(c, os);
   *os << "'";
 
   // To aid user debugging, we also print c's code in decimal, unless
@@ -242,21 +291,21 @@
   *os << ")";
 }
 
-void PrintTo(unsigned char c, ::std::ostream* os) {
-  PrintCharAndCodeTo<unsigned char>(c, os);
-}
-void PrintTo(signed char c, ::std::ostream* os) {
-  PrintCharAndCodeTo<unsigned char>(c, os);
-}
+void PrintTo(unsigned char c, ::std::ostream* os) { PrintCharAndCodeTo(c, os); }
+void PrintTo(signed char c, ::std::ostream* os) { PrintCharAndCodeTo(c, os); }
 
 // Prints a wchar_t as a symbol if it is printable or as its internal
 // code otherwise and also as its code.  L'\0' is printed as "L'\\0'".
-void PrintTo(wchar_t wc, ostream* os) {
-  PrintCharAndCodeTo<wchar_t>(wc, os);
+void PrintTo(wchar_t wc, ostream* os) { PrintCharAndCodeTo(wc, os); }
+
+// TODO(dcheng): Consider making this delegate to PrintCharAndCodeTo() as well.
+void PrintTo(char32_t c, ::std::ostream* os) {
+  *os << std::hex << "U+" << std::uppercase << std::setfill('0') << std::setw(4)
+      << static_cast<uint32_t>(c);
 }
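
A usage sketch of the char32_t printer above (FormatChar32 is a hypothetical wrapper; setw/setfill come from <iomanip>):

#include <cstdint>
#include <iomanip>
#include <sstream>
#include <string>

std::string FormatChar32(char32_t c) {
  std::ostringstream os;
  os << std::hex << "U+" << std::uppercase << std::setfill('0') << std::setw(4)
     << static_cast<uint32_t>(c);
  return os.str();  // FormatChar32(U'\u2603') == "U+2603"
}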
 
 // Prints the given array of characters to the ostream.  CharType must be either
-// char or wchar_t.
+// char, char8_t, char16_t, char32_t, or wchar_t.
 // The array starts at begin, the length is len, it may include '\0' characters
 // and may not be NUL-terminated.
 template <typename CharType>
@@ -266,8 +315,8 @@
 GTEST_ATTRIBUTE_NO_SANITIZE_THREAD_
 static CharFormat PrintCharsAsStringTo(
     const CharType* begin, size_t len, ostream* os) {
-  const char* const kQuoteBegin = sizeof(CharType) == 1 ? "\"" : "L\"";
-  *os << kQuoteBegin;
+  const char* const quote_prefix = GetCharWidthPrefix(*begin);
+  *os << quote_prefix << "\"";
   bool is_previous_hex = false;
   CharFormat print_format = kAsIs;
   for (size_t index = 0; index < len; ++index) {
@@ -276,7 +325,7 @@
       // Previous character is of '\x..' form and this character can be
       // interpreted as another hexadecimal digit in its number. Break string to
       // disambiguate.
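+      // For example, the two characters {'\x12', '3'} are printed as
+      // "\x12" "3" rather than the ambiguous "\x123".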
-      *os << "\" " << kQuoteBegin;
+      *os << "\" " << quote_prefix << "\"";
     }
     is_previous_hex = PrintAsStringLiteralTo(cur, os) == kHexEscape;
     // Remember if any characters required hex escaping.
@@ -322,22 +371,57 @@
   UniversalPrintCharArray(begin, len, os);
 }
 
+#ifdef __cpp_char8_t
+// Prints a (const) char8_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const char8_t* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+#endif
+
+// Prints a (const) char16_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const char16_t* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+
+// Prints a (const) char32_t array of 'len' elements, starting at address
+// 'begin'.
+void UniversalPrintArray(const char32_t* begin, size_t len, ostream* os) {
+  UniversalPrintCharArray(begin, len, os);
+}
+
 // Prints a (const) wchar_t array of 'len' elements, starting at address
 // 'begin'.
 void UniversalPrintArray(const wchar_t* begin, size_t len, ostream* os) {
   UniversalPrintCharArray(begin, len, os);
 }
 
-// Prints the given C string to the ostream.
-void PrintTo(const char* s, ostream* os) {
+namespace {
+
+// Prints a null-terminated C-style string to the ostream.
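+// For a non-null s, the output is the pointer value followed by the quoted
+// contents, e.g.: 0x2a1c pointing to "hello" (the address is illustrative).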
+template <typename Char>
+void PrintCStringTo(const Char* s, ostream* os) {
   if (s == nullptr) {
     *os << "NULL";
   } else {
     *os << ImplicitCast_<const void*>(s) << " pointing to ";
-    PrintCharsAsStringTo(s, strlen(s), os);
+    PrintCharsAsStringTo(s, std::char_traits<Char>::length(s), os);
   }
 }
 
+}  // anonymous namespace
+
+void PrintTo(const char* s, ostream* os) { PrintCStringTo(s, os); }
+
+#ifdef __cpp_char8_t
+void PrintTo(const char8_t* s, ostream* os) { PrintCStringTo(s, os); }
+#endif
+
+void PrintTo(const char16_t* s, ostream* os) { PrintCStringTo(s, os); }
+
+void PrintTo(const char32_t* s, ostream* os) { PrintCStringTo(s, os); }
+
 // The MSVC compiler can be configured to define wchar_t as a typedef
 // of unsigned short. Defining an overload for const wchar_t* in that case
 // would cause pointers to unsigned shorts to be printed as wide strings,
@@ -346,14 +430,7 @@
 // wchar_t is implemented as a native type.
 #if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED)
 // Prints the given wide C string to the ostream.
-void PrintTo(const wchar_t* s, ostream* os) {
-  if (s == nullptr) {
-    *os << "NULL";
-  } else {
-    *os << ImplicitCast_<const void*>(s) << " pointing to ";
-    PrintCharsAsStringTo(s, wcslen(s), os);
-  }
-}
+void PrintTo(const wchar_t* s, ostream* os) { PrintCStringTo(s, os); }
 #endif  // wchar_t is native
 
 namespace {
@@ -431,6 +508,20 @@
   }
 }
 
+#ifdef __cpp_char8_t
+void PrintU8StringTo(const ::std::u8string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+#endif
+
+void PrintU16StringTo(const ::std::u16string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+
+void PrintU32StringTo(const ::std::u32string& s, ostream* os) {
+  PrintCharsAsStringTo(s.data(), s.size(), os);
+}
+
 #if GTEST_HAS_STD_WSTRING
 void PrintWideStringTo(const ::std::wstring& s, ostream* os) {
   PrintCharsAsStringTo(s.data(), s.size(), os);
diff --git a/libvpx/third_party/googletest/src/src/gtest-typed-test.cc b/libvpx/third_party/googletest/src/src/gtest-typed-test.cc
index 1b1cfb0..c02c3df 100644
--- a/libvpx/third_party/googletest/src/src/gtest-typed-test.cc
+++ b/libvpx/third_party/googletest/src/src/gtest-typed-test.cc
@@ -35,8 +35,6 @@
 namespace testing {
 namespace internal {
 
-#if GTEST_HAS_TYPED_TEST_P
-
 // Skips to the first non-space char in str. Returns an empty string if str
 // contains only whitespace characters.
 static const char* SkipSpaces(const char* str) {
@@ -78,17 +76,7 @@
       continue;
     }
 
-    bool found = false;
-    for (RegisteredTestIter it = registered_tests_.begin();
-         it != registered_tests_.end();
-         ++it) {
-      if (name == it->first) {
-        found = true;
-        break;
-      }
-    }
-
-    if (found) {
+    if (registered_tests_.count(name) != 0) {
       tests.insert(name);
     } else {
       errors << "No test named " << name
@@ -115,7 +103,5 @@
   return registered_tests;
 }
 
-#endif  // GTEST_HAS_TYPED_TEST_P
-
 }  // namespace internal
 }  // namespace testing
diff --git a/libvpx/third_party/googletest/src/src/gtest.cc b/libvpx/third_party/googletest/src/src/gtest.cc
index b8f6a5c..21c611a 100644
--- a/libvpx/third_party/googletest/src/src/gtest.cc
+++ b/libvpx/third_party/googletest/src/src/gtest.cc
@@ -35,7 +35,6 @@
 #include "gtest/gtest-spi.h"
 
 #include <ctype.h>
-#include <math.h>
 #include <stdarg.h>
 #include <stdio.h>
 #include <stdlib.h>
@@ -44,6 +43,8 @@
 #include <wctype.h>
 
 #include <algorithm>
+#include <chrono>  // NOLINT
+#include <cmath>
 #include <cstdint>
 #include <iomanip>
 #include <limits>
@@ -55,8 +56,6 @@
 
 #if GTEST_OS_LINUX
 
-# define GTEST_HAS_GETTIMEOFDAY_ 1
-
 # include <fcntl.h>  // NOLINT
 # include <limits.h>  // NOLINT
 # include <sched.h>  // NOLINT
@@ -68,7 +67,6 @@
 # include <string>
 
 #elif GTEST_OS_ZOS
-# define GTEST_HAS_GETTIMEOFDAY_ 1
 # include <sys/time.h>  // NOLINT
 
 // On z/OS we additionally need strings.h for strcasecmp.
@@ -86,7 +84,6 @@
 
 #ifdef _MSC_VER
 # include <crtdbg.h>  // NOLINT
-# include <debugapi.h>  // NOLINT
 #endif
 
 # include <io.h>  // NOLINT
@@ -95,16 +92,11 @@
 # include <sys/stat.h>  // NOLINT
 
 # if GTEST_OS_WINDOWS_MINGW
-// MinGW has gettimeofday() but not _ftime64().
-#  define GTEST_HAS_GETTIMEOFDAY_ 1
 #  include <sys/time.h>  // NOLINT
 # endif  // GTEST_OS_WINDOWS_MINGW
 
 #else
 
-// Assume other platforms have gettimeofday().
-# define GTEST_HAS_GETTIMEOFDAY_ 1
-
 // cpplint thinks that the header is already included, so we want to
 // silence it.
 # include <sys/time.h>  // NOLINT
@@ -213,6 +205,21 @@
   return kUniversalFilter;
 }
 
+// Bazel passes in the argument to '--test_runner_fail_fast' via the
+// TESTBRIDGE_TEST_RUNNER_FAIL_FAST environment variable.
+static bool GetDefaultFailFast() {
+  const char* const testbridge_test_runner_fail_fast =
+      internal::posix::GetEnv("TESTBRIDGE_TEST_RUNNER_FAIL_FAST");
+  if (testbridge_test_runner_fail_fast != nullptr) {
+    return strcmp(testbridge_test_runner_fail_fast, "1") == 0;
+  }
+  return false;
+}
+
+GTEST_DEFINE_bool_(
+    fail_fast, internal::BoolFromGTestEnv("fail_fast", GetDefaultFailFast()),
+    "True if and only if a test failure should stop further test execution.");
+
 GTEST_DEFINE_bool_(
     also_run_disabled_tests,
     internal::BoolFromGTestEnv("also_run_disabled_tests", false),
@@ -273,6 +280,10 @@
     "executable's name and, if necessary, made unique by adding "
     "digits.");
 
+GTEST_DEFINE_bool_(
+    brief, internal::BoolFromGTestEnv("brief", false),
+    "True if only test failures should be displayed in text output.");
+
 GTEST_DEFINE_bool_(print_time, internal::BoolFromGTestEnv("print_time", true),
                    "True if and only if " GTEST_NAME_
                    " should display elapsed time in text output.");
@@ -479,7 +490,7 @@
       "removed but the rest got left behind.";
 
   std::string message =
-      "Paramaterized test suite " + name +
+      "Parameterized test suite " + name +
       (has_test_p ? kMissingInstantiation : kMissingTestCase) +
       "\n\n"
       "To suppress this error for this test suite, insert the following line "
@@ -487,7 +498,7 @@
       "\n\n"
       "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" + name + ");";
 
-  std::string full_name = "UninstantiatedParamaterizedTestSuite<" + name + ">";
+  std::string full_name = "UninstantiatedParameterizedTestSuite<" + name + ">";
   RegisterTest(  //
       "GoogleTestVerification", full_name.c_str(),
       nullptr,  // No type parameter.
@@ -534,7 +545,7 @@
     if (ignored.find(testcase.first) != ignored.end()) continue;
 
     std::string message =
-        "Type paramaterized test suite " + testcase.first +
+        "Type parameterized test suite " + testcase.first +
         " is defined via REGISTER_TYPED_TEST_SUITE_P, but never instantiated "
         "via INSTANTIATE_TYPED_TEST_SUITE_P. None of the test cases will run."
         "\n\n"
@@ -544,13 +555,13 @@
         "utilities.)"
         "\n\n"
         "To suppress this error for this test suite, insert the following line "
-        "(in a non-header) in the namespace it is definedin in:"
+        "(in a non-header) in the namespace it is defined in:"
         "\n\n"
         "GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(" +
         testcase.first + ");";
 
     std::string full_name =
-        "UninstantiatedTypeParamaterizedTestSuite<" + testcase.first + ">";
+        "UninstantiatedTypeParameterizedTestSuite<" + testcase.first + ">";
     RegisterTest(  //
         "GoogleTestVerification", full_name.c_str(),
         nullptr,  // No type parameter.
@@ -635,47 +646,82 @@
   return result.string();
 }
 
-// Returns true if and only if the wildcard pattern matches the string.
-// The first ':' or '\0' character in pattern marks the end of it.
+// Returns true if and only if the wildcard pattern matches the string. Each
+// pattern consists of regular characters, single-character wildcards (?), and
+// multi-character wildcards (*).
 //
-// This recursive algorithm isn't very efficient, but is clear and
-// works well enough for matching test names, which are short.
-bool UnitTestOptions::PatternMatchesString(const char *pattern,
-                                           const char *str) {
-  switch (*pattern) {
-    case '\0':
-    case ':':  // Either ':' or '\0' marks the end of the pattern.
-      return *str == '\0';
-    case '?':  // Matches any single character.
-      return *str != '\0' && PatternMatchesString(pattern + 1, str + 1);
-    case '*':  // Matches any string (possibly empty) of characters.
-      return (*str != '\0' && PatternMatchesString(pattern, str + 1)) ||
-          PatternMatchesString(pattern + 1, str);
-    default:  // Non-special character.  Matches itself.
-      return *pattern == *str &&
-          PatternMatchesString(pattern + 1, str + 1);
+// This function implements a linear-time string globbing algorithm based on
+// https://research.swtch.com/glob.
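+//
+// For example, matching the pattern "a*bc" against "axbc" first tries the '*'
+// with zero characters; when 'b' then fails against 'x', the match restarts
+// with the '*' consuming one more character, after which "bc" lines up.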
+static bool PatternMatchesString(const std::string& name_str,
+                                 const char* pattern, const char* pattern_end) {
+  const char* name = name_str.c_str();
+  const char* const name_begin = name;
+  const char* const name_end = name + name_str.size();
+
+  const char* pattern_next = pattern;
+  const char* name_next = name;
+
+  while (pattern < pattern_end || name < name_end) {
+    if (pattern < pattern_end) {
+      switch (*pattern) {
+        default:  // Match an ordinary character.
+          if (name < name_end && *name == *pattern) {
+            ++pattern;
+            ++name;
+            continue;
+          }
+          break;
+        case '?':  // Match any single character.
+          if (name < name_end) {
+            ++pattern;
+            ++name;
+            continue;
+          }
+          break;
+        case '*':
+          // Match zero or more characters. Start by skipping over the wildcard
+          // and matching zero characters from name. If that fails, restart and
+          // match one more character than the last attempt.
+          pattern_next = pattern;
+          name_next = name + 1;
+          ++pattern;
+          continue;
+      }
+    }
+    // Failed to match a character. Restart if possible.
+    if (name_begin < name_next && name_next <= name_end) {
+      pattern = pattern_next;
+      name = name_next;
+      continue;
+    }
+    return false;
   }
+  return true;
 }
 
-bool UnitTestOptions::MatchesFilter(
-    const std::string& name, const char* filter) {
-  const char *cur_pattern = filter;
-  for (;;) {
-    if (PatternMatchesString(cur_pattern, name.c_str())) {
+bool UnitTestOptions::MatchesFilter(const std::string& name_str,
+                                    const char* filter) {
+  // The filter is a list of patterns separated by colons (:).
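+  // For example, the filter "FooTest.*:BarTest.?" consists of the two
+  // patterns "FooTest.*" and "BarTest.?".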
+  const char* pattern = filter;
+  while (true) {
+    // Find the bounds of this pattern.
+    const char* const next_sep = strchr(pattern, ':');
+    const char* const pattern_end =
+        next_sep != nullptr ? next_sep : pattern + strlen(pattern);
+
+    // Check if this pattern matches name_str.
+    if (PatternMatchesString(name_str, pattern, pattern_end)) {
       return true;
     }
 
-    // Finds the next pattern in the filter.
-    cur_pattern = strchr(cur_pattern, ':');
-
-    // Returns if no more pattern can be found.
-    if (cur_pattern == nullptr) {
+    // Give up on this pattern. However, if we found a pattern separator (:),
+    // advance to the next pattern (skipping over the separator) and restart.
+    if (next_sep == nullptr) {
       return false;
     }
-
-    // Skips the pattern separater (the ':' character).
-    cur_pattern++;
+    pattern = next_sep + 1;
   }
+  return true;
 }
 
 // Returns true if and only if the user-specified filter matches the test
@@ -985,44 +1031,30 @@
       );  // NOLINT
 }
 
-// Returns the current time in milliseconds.
-TimeInMillis GetTimeInMillis() {
-#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__)
-  // Difference between 1970-01-01 and 1601-01-01 in milliseconds.
-  // http://analogous.blogspot.com/2005/04/epoch.html
-  const TimeInMillis kJavaEpochToWinFileTimeDelta =
-    static_cast<TimeInMillis>(116444736UL) * 100000UL;
-  const DWORD kTenthMicrosInMilliSecond = 10000;
+// A helper class for measuring elapsed times.
+class Timer {
+ public:
+  Timer() : start_(std::chrono::steady_clock::now()) {}
 
-  SYSTEMTIME now_systime;
-  FILETIME now_filetime;
-  ULARGE_INTEGER now_int64;
-  GetSystemTime(&now_systime);
-  if (SystemTimeToFileTime(&now_systime, &now_filetime)) {
-    now_int64.LowPart = now_filetime.dwLowDateTime;
-    now_int64.HighPart = now_filetime.dwHighDateTime;
-    now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) -
-      kJavaEpochToWinFileTimeDelta;
-    return now_int64.QuadPart;
+  // Returns the time elapsed in milliseconds since the timer was created.
+  TimeInMillis Elapsed() {
+    return std::chrono::duration_cast<std::chrono::milliseconds>(
+               std::chrono::steady_clock::now() - start_)
+        .count();
   }
-  return 0;
-#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_
-  __timeb64 now;
 
-  // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996
-  // (deprecated function) there.
-  GTEST_DISABLE_MSC_DEPRECATED_PUSH_()
-  _ftime64(&now);
-  GTEST_DISABLE_MSC_DEPRECATED_POP_()
+ private:
+  std::chrono::steady_clock::time_point start_;
+};
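+
+// A minimal usage sketch (RunWork is a hypothetical workload):
+//   internal::Timer timer;
+//   RunWork();
+//   const TimeInMillis elapsed_ms = timer.Elapsed();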
 
-  return static_cast<TimeInMillis>(now.time) * 1000 + now.millitm;
-#elif GTEST_HAS_GETTIMEOFDAY_
-  struct timeval now;
-  gettimeofday(&now, nullptr);
-  return static_cast<TimeInMillis>(now.tv_sec) * 1000 + now.tv_usec / 1000;
-#else
-# error "Don't know how to get the current time on your system."
-#endif
+// Returns a timestamp as milliseconds since the epoch. Note this time may
+// jump around subject to adjustments by the system; to measure elapsed time,
+// use Timer instead.
+TimeInMillis GetTimeInMillis() {
+  return std::chrono::duration_cast<std::chrono::milliseconds>(
+             std::chrono::system_clock::now() -
+             std::chrono::system_clock::from_time_t(0))
+      .count();
 }
 
 // Utilities
@@ -1537,6 +1569,31 @@
   const double diff = fabs(val1 - val2);
   if (diff <= abs_error) return AssertionSuccess();
 
+  // Find the value which is closest to zero.
+  const double min_abs = std::min(fabs(val1), fabs(val2));
+  // Find the distance to the next double from that value.
+  const double epsilon =
+      nextafter(min_abs, std::numeric_limits<double>::infinity()) - min_abs;
+  // Detect the case where abs_error is so small that EXPECT_NEAR is
+  // effectively the same as EXPECT_EQUAL, and give an informative error
+  // message so that the situation can be more easily understood without
+  // requiring exotic floating-point knowledge.
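+  // For example, with val1 == 1.0 the next representable double is
+  // 1.0 + 2^-52, so any positive abs_error below roughly 2.2e-16 can only be
+  // satisfied by exactly equal values.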
+  // Don't do an epsilon check if abs_error is zero because that implies
+  // that an equality check was actually intended.
+  if (!(std::isnan)(val1) && !(std::isnan)(val2) && abs_error > 0 &&
+      abs_error < epsilon) {
+    return AssertionFailure()
+           << "The difference between " << expr1 << " and " << expr2 << " is "
+           << diff << ", where\n"
+           << expr1 << " evaluates to " << val1 << ",\n"
+           << expr2 << " evaluates to " << val2 << ".\nThe abs_error parameter "
+           << abs_error_expr << " evaluates to " << abs_error
+           << " which is smaller than the minimum distance between doubles for "
+              "numbers of this magnitude which is "
+           << epsilon
+           << ", thus making this EXPECT_NEAR check equivalent to "
+              "EXPECT_EQUAL. Consider using EXPECT_DOUBLE_EQ instead.";
+  }
   return AssertionFailure()
       << "The difference between " << expr1 << " and " << expr2
       << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n"
@@ -1599,57 +1656,6 @@
 
 namespace internal {
 
-// The helper function for {ASSERT|EXPECT}_EQ with int or enum
-// arguments.
-AssertionResult CmpHelperEQ(const char* lhs_expression,
-                            const char* rhs_expression,
-                            BiggestInt lhs,
-                            BiggestInt rhs) {
-  if (lhs == rhs) {
-    return AssertionSuccess();
-  }
-
-  return EqFailure(lhs_expression,
-                   rhs_expression,
-                   FormatForComparisonFailureMessage(lhs, rhs),
-                   FormatForComparisonFailureMessage(rhs, lhs),
-                   false);
-}
-
-// A macro for implementing the helper functions needed to implement
-// ASSERT_?? and EXPECT_?? with integer or enum arguments.  It is here
-// just to avoid copy-and-paste of similar code.
-#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
-AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
-                                   BiggestInt val1, BiggestInt val2) {\
-  if (val1 op val2) {\
-    return AssertionSuccess();\
-  } else {\
-    return AssertionFailure() \
-        << "Expected: (" << expr1 << ") " #op " (" << expr2\
-        << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\
-        << " vs " << FormatForComparisonFailureMessage(val2, val1);\
-  }\
-}
-
-// Implements the helper function for {ASSERT|EXPECT}_NE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(NE, !=)
-// Implements the helper function for {ASSERT|EXPECT}_LE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(LE, <=)
-// Implements the helper function for {ASSERT|EXPECT}_LT with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(LT, < )
-// Implements the helper function for {ASSERT|EXPECT}_GE with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(GE, >=)
-// Implements the helper function for {ASSERT|EXPECT}_GT with int or
-// enum arguments.
-GTEST_IMPL_CMP_HELPER_(GT, > )
-
-#undef GTEST_IMPL_CMP_HELPER_
-
 // The helper function for {ASSERT|EXPECT}_STREQ.
 AssertionResult CmpHelperSTREQ(const char* lhs_expression,
                                const char* rhs_expression,
@@ -2123,8 +2129,13 @@
 
 // Formats an int value as "%02d".
 std::string String::FormatIntWidth2(int value) {
+  return FormatIntWidthN(value, 2);
+}
+
+// Formats an int value to the given width with leading zeros.
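+// For example, FormatIntWidthN(7, 3) returns "007".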
+std::string String::FormatIntWidthN(int value, int width) {
   std::stringstream ss;
-  ss << std::setfill('0') << std::setw(2) << value;
+  ss << std::setfill('0') << std::setw(width) << value;
   return ss.str();
 }
 
@@ -2176,7 +2187,9 @@
   if (user_msg_string.empty()) {
     return gtest_msg;
   }
-
+  if (gtest_msg.empty()) {
+    return user_msg_string;
+  }
   return gtest_msg + "\n" + user_msg_string;
 }
 
@@ -2228,7 +2241,7 @@
   if (!ValidateTestProperty(xml_element, test_property)) {
     return;
   }
-  internal::MutexLock lock(&test_properites_mutex_);
+  internal::MutexLock lock(&test_properties_mutex_);
   const std::vector<TestProperty>::iterator property_with_matching_key =
       std::find_if(test_properties_.begin(), test_properties_.end(),
                    internal::TestPropertyKeyIs(test_property.key()));
@@ -2255,7 +2268,8 @@
 // The list of reserved attributes used in the <testsuite> element of XML
 // output.
 static const char* const kReservedTestSuiteAttributes[] = {
-    "disabled", "errors", "failures", "name", "tests", "time", "timestamp"};
+    "disabled", "errors", "failures",  "name",
+    "tests",    "time",   "timestamp", "skipped"};
 
 // The list of reserved attributes used in the <testcase> element of XML output.
 static const char* const kReservedTestCaseAttributes[] = {
@@ -2268,7 +2282,7 @@
     "classname",   "name", "status", "time",   "type_param",
     "value_param", "file", "line",   "result", "timestamp"};
 
-template <int kSize>
+template <size_t kSize>
 std::vector<std::string> ArrayAsVector(const char* const (&array)[kSize]) {
   return std::vector<std::string>(array, array + kSize);
 }
@@ -2712,6 +2726,7 @@
       should_run_(false),
       is_disabled_(false),
       matches_filter_(false),
+      is_in_another_shard_(false),
       factory_(factory),
       result_() {}
 
@@ -2725,7 +2740,7 @@
 //
 // Arguments:
 //
-//   test_suite_name:   name of the test suite
+//   test_suite_name:  name of the test suite
 //   name:             name of the test
 //   type_param:       the name of the test's type parameter, or NULL if
 //                     this is not a typed or a type-parameterized test.
@@ -2827,7 +2842,8 @@
   // Notifies the unit test event listeners that a test is about to start.
   repeater->OnTestStart(*this);
 
-  const TimeInMillis start = internal::GetTimeInMillis();
+  result_.set_start_timestamp(internal::GetTimeInMillis());
+  internal::Timer timer;
 
   impl->os_stack_trace_getter()->UponLeavingGTest();
 
@@ -2852,8 +2868,7 @@
         test, &Test::DeleteSelf_, "the test fixture's destructor");
   }
 
-  result_.set_start_timestamp(start);
-  result_.set_elapsed_time(internal::GetTimeInMillis() - start);
+  result_.set_elapsed_time(timer.Elapsed());
 
   // Notifies the unit test event listener that a test has just finished.
   repeater->OnTestEnd(*this);
@@ -2863,6 +2878,28 @@
   impl->set_current_test_info(nullptr);
 }
 
+// Skips and records a skipped test result for this object.
+void TestInfo::Skip() {
+  if (!should_run_) return;
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_info(this);
+
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  // Notifies the unit test event listeners that a test is about to start.
+  repeater->OnTestStart(*this);
+
+  const TestPartResult test_part_result =
+      TestPartResult(TestPartResult::kSkip, this->file(), this->line(), "");
+  impl->GetTestPartResultReporterForCurrentThread()->ReportTestPartResult(
+      test_part_result);
+
+  // Notifies the unit test event listener that a test has just finished.
+  repeater->OnTestEnd(*this);
+  impl->set_current_test_info(nullptr);
+}
+
 // class TestSuite
 
 // Gets the number of successful tests in this test suite.
@@ -2909,7 +2946,7 @@
 //
 // Arguments:
 //
-//   name:         name of the test suite
+//   a_name:       name of the test suite
 //   a_type_param: the name of the test suite's type parameter, or NULL if
 //                 this is not a typed or a type-parameterized test suite.
 //   set_up_tc:    pointer to the function that sets up the test suite
@@ -2964,19 +3001,26 @@
   // Call both legacy and the new API
   repeater->OnTestSuiteStart(*this);
 //  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
   repeater->OnTestCaseStart(*this);
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   impl->os_stack_trace_getter()->UponLeavingGTest();
   internal::HandleExceptionsInMethodIfSupported(
       this, &TestSuite::RunSetUpTestSuite, "SetUpTestSuite()");
 
   start_timestamp_ = internal::GetTimeInMillis();
+  internal::Timer timer;
   for (int i = 0; i < total_test_count(); i++) {
     GetMutableTestInfo(i)->Run();
+    if (GTEST_FLAG(fail_fast) && GetMutableTestInfo(i)->result()->Failed()) {
+      for (int j = i + 1; j < total_test_count(); j++) {
+        GetMutableTestInfo(j)->Skip();
+      }
+      break;
+    }
   }
-  elapsed_time_ = internal::GetTimeInMillis() - start_timestamp_;
+  elapsed_time_ = timer.Elapsed();
 
   impl->os_stack_trace_getter()->UponLeavingGTest();
   internal::HandleExceptionsInMethodIfSupported(
@@ -2985,9 +3029,39 @@
   // Call both legacy and the new API
   repeater->OnTestSuiteEnd(*this);
 //  Legacy API is deprecated but still available
-#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
   repeater->OnTestCaseEnd(*this);
-#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  impl->set_current_test_suite(nullptr);
+}
+
+// Skips all tests under this TestSuite.
+void TestSuite::Skip() {
+  if (!should_run_) return;
+
+  internal::UnitTestImpl* const impl = internal::GetUnitTestImpl();
+  impl->set_current_test_suite(this);
+
+  TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater();
+
+  // Call both legacy and the new API
+  repeater->OnTestSuiteStart(*this);
+//  Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  repeater->OnTestCaseStart(*this);
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  for (int i = 0; i < total_test_count(); i++) {
+    GetMutableTestInfo(i)->Skip();
+  }
+
+  // Call both legacy and the new API
+  repeater->OnTestSuiteEnd(*this);
+  // Legacy API is deprecated but still available
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  repeater->OnTestCaseEnd(*this);
+#endif  //  GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
   impl->set_current_test_suite(nullptr);
 }
@@ -3039,7 +3113,7 @@
 static const char * TestPartResultTypeToString(TestPartResult::Type type) {
   switch (type) {
     case TestPartResult::kSkip:
-      return "Skipped";
+      return "Skipped\n";
     case TestPartResult::kSuccess:
       return "Success";
 
@@ -3056,6 +3130,9 @@
 }
 
 namespace internal {
+namespace {
+enum class GTestColor { kDefault, kRed, kGreen, kYellow };
+}  // namespace
 
 // Prints a TestPartResult to an std::string.
 static std::string PrintTestPartResultToString(
@@ -3093,9 +3170,12 @@
 // Returns the character attribute for the given color.
 static WORD GetColorAttribute(GTestColor color) {
   switch (color) {
-    case COLOR_RED:    return FOREGROUND_RED;
-    case COLOR_GREEN:  return FOREGROUND_GREEN;
-    case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN;
+    case GTestColor::kRed:
+      return FOREGROUND_RED;
+    case GTestColor::kGreen:
+      return FOREGROUND_GREEN;
+    case GTestColor::kYellow:
+      return FOREGROUND_RED | FOREGROUND_GREEN;
     default:
       return 0;
   }
 }
@@ -3133,13 +3213,16 @@
 
 #else
 
-// Returns the ANSI color code for the given color.  COLOR_DEFAULT is
+// Returns the ANSI color code for the given color. GTestColor::kDefault is
 // an invalid input.
 static const char* GetAnsiColorCode(GTestColor color) {
   switch (color) {
-    case COLOR_RED:     return "1";
-    case COLOR_GREEN:   return "2";
-    case COLOR_YELLOW:  return "3";
+    case GTestColor::kRed:
+      return "1";
+    case GTestColor::kGreen:
+      return "2";
+    case GTestColor::kYellow:
+      return "3";
     default:
       return nullptr;
   }
@@ -3188,7 +3271,9 @@
 // cannot simply emit special characters and have the terminal change colors.
 // This routine must actually emit the characters rather than return a string
 // that would be colored when printed, as can be done on Linux.
-void ColoredPrintf(GTestColor color, const char* fmt, ...) {
+
+GTEST_ATTRIBUTE_PRINTF_(2, 3)
+static void ColoredPrintf(GTestColor color, const char *fmt, ...) {
   va_list args;
   va_start(args, fmt);
 
@@ -3198,7 +3283,7 @@
 #else
   static const bool in_color_mode =
       ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0);
-  const bool use_color = in_color_mode && (color != COLOR_DEFAULT);
+  const bool use_color = in_color_mode && (color != GTestColor::kDefault);
 #endif  // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_ZOS
 
   if (!use_color) {
@@ -3310,25 +3395,24 @@
   // Prints the filter if it's not *.  This reminds the user that some
   // tests may be skipped.
   if (!String::CStringEquals(filter, kUniversalFilter)) {
-    ColoredPrintf(COLOR_YELLOW,
-                  "Note: %s filter = %s\n", GTEST_NAME_, filter);
+    ColoredPrintf(GTestColor::kYellow, "Note: %s filter = %s\n", GTEST_NAME_,
+                  filter);
   }
 
   if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) {
     const int32_t shard_index = Int32FromEnvOrDie(kTestShardIndex, -1);
-    ColoredPrintf(COLOR_YELLOW,
-                  "Note: This is test shard %d of %s.\n",
+    ColoredPrintf(GTestColor::kYellow, "Note: This is test shard %d of %s.\n",
                   static_cast<int>(shard_index) + 1,
                   internal::posix::GetEnv(kTestTotalShards));
   }
 
   if (GTEST_FLAG(shuffle)) {
-    ColoredPrintf(COLOR_YELLOW,
+    ColoredPrintf(GTestColor::kYellow,
                   "Note: Randomizing tests' orders with a seed of %d .\n",
                   unit_test.random_seed());
   }
 
-  ColoredPrintf(COLOR_GREEN,  "[==========] ");
+  ColoredPrintf(GTestColor::kGreen, "[==========] ");
   printf("Running %s from %s.\n",
          FormatTestCount(unit_test.test_to_run_count()).c_str(),
          FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
@@ -3337,7 +3421,7 @@
 
 void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart(
     const UnitTest& /*unit_test*/) {
-  ColoredPrintf(COLOR_GREEN,  "[----------] ");
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
   printf("Global test environment set-up.\n");
   fflush(stdout);
 }
@@ -3346,7 +3430,7 @@
 void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) {
   const std::string counts =
       FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
-  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
   printf("%s from %s", counts.c_str(), test_case.name());
   if (test_case.type_param() == nullptr) {
     printf("\n");
@@ -3360,7 +3444,7 @@
     const TestSuite& test_suite) {
   const std::string counts =
       FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests");
-  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
   printf("%s from %s", counts.c_str(), test_suite.name());
   if (test_suite.type_param() == nullptr) {
     printf("\n");
@@ -3372,7 +3456,7 @@
 #endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
 
 void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) {
-  ColoredPrintf(COLOR_GREEN,  "[ RUN      ] ");
+  ColoredPrintf(GTestColor::kGreen, "[ RUN      ] ");
   PrintTestName(test_info.test_suite_name(), test_info.name());
   printf("\n");
   fflush(stdout);
@@ -3395,11 +3479,11 @@
 
 void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
   if (test_info.result()->Passed()) {
-    ColoredPrintf(COLOR_GREEN, "[       OK ] ");
+    ColoredPrintf(GTestColor::kGreen, "[       OK ] ");
   } else if (test_info.result()->Skipped()) {
-    ColoredPrintf(COLOR_GREEN, "[  SKIPPED ] ");
+    ColoredPrintf(GTestColor::kGreen, "[  SKIPPED ] ");
   } else {
-    ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+    ColoredPrintf(GTestColor::kRed, "[  FAILED  ] ");
   }
   PrintTestName(test_info.test_suite_name(), test_info.name());
   if (test_info.result()->Failed())
@@ -3420,7 +3504,7 @@
 
   const std::string counts =
       FormatCountableNoun(test_case.test_to_run_count(), "test", "tests");
-  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
   printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_case.name(),
          internal::StreamableToString(test_case.elapsed_time()).c_str());
   fflush(stdout);
@@ -3431,7 +3515,7 @@
 
   const std::string counts =
       FormatCountableNoun(test_suite.test_to_run_count(), "test", "tests");
-  ColoredPrintf(COLOR_GREEN, "[----------] ");
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
   printf("%s from %s (%s ms total)\n\n", counts.c_str(), test_suite.name(),
          internal::StreamableToString(test_suite.elapsed_time()).c_str());
   fflush(stdout);
@@ -3440,7 +3524,7 @@
 
 void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart(
     const UnitTest& /*unit_test*/) {
-  ColoredPrintf(COLOR_GREEN,  "[----------] ");
+  ColoredPrintf(GTestColor::kGreen, "[----------] ");
   printf("Global test environment tear-down\n");
   fflush(stdout);
 }
@@ -3448,7 +3532,7 @@
 // Internal helper for printing the list of failed tests.
 void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) {
   const int failed_test_count = unit_test.failed_test_count();
-  ColoredPrintf(COLOR_RED,  "[  FAILED  ] ");
+  ColoredPrintf(GTestColor::kRed, "[  FAILED  ] ");
   printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str());
 
   for (int i = 0; i < unit_test.total_test_suite_count(); ++i) {
@@ -3461,7 +3545,7 @@
       if (!test_info.should_run() || !test_info.result()->Failed()) {
         continue;
       }
-      ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+      ColoredPrintf(GTestColor::kRed, "[  FAILED  ] ");
       printf("%s.%s", test_suite.name(), test_info.name());
       PrintFullTestCommentIfPresent(test_info);
       printf("\n");
@@ -3482,7 +3566,7 @@
       continue;
     }
     if (test_suite.ad_hoc_test_result().Failed()) {
-      ColoredPrintf(COLOR_RED, "[  FAILED  ] ");
+      ColoredPrintf(GTestColor::kRed, "[  FAILED  ] ");
       printf("%s: SetUpTestSuite or TearDownTestSuite\n", test_suite.name());
       ++suite_failure_count;
     }
@@ -3510,7 +3594,7 @@
       if (!test_info.should_run() || !test_info.result()->Skipped()) {
         continue;
       }
-      ColoredPrintf(COLOR_GREEN, "[  SKIPPED ] ");
+      ColoredPrintf(GTestColor::kGreen, "[  SKIPPED ] ");
       printf("%s.%s", test_suite.name(), test_info.name());
       printf("\n");
     }
@@ -3519,7 +3603,7 @@
 
 void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
                                                      int /*iteration*/) {
-  ColoredPrintf(COLOR_GREEN,  "[==========] ");
+  ColoredPrintf(GTestColor::kGreen, "[==========] ");
   printf("%s from %s ran.",
          FormatTestCount(unit_test.test_to_run_count()).c_str(),
          FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
@@ -3528,12 +3612,12 @@
            internal::StreamableToString(unit_test.elapsed_time()).c_str());
   }
   printf("\n");
-  ColoredPrintf(COLOR_GREEN,  "[  PASSED  ] ");
+  ColoredPrintf(GTestColor::kGreen, "[  PASSED  ] ");
   printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
 
   const int skipped_test_count = unit_test.skipped_test_count();
   if (skipped_test_count > 0) {
-    ColoredPrintf(COLOR_GREEN, "[  SKIPPED ] ");
+    ColoredPrintf(GTestColor::kGreen, "[  SKIPPED ] ");
     printf("%s, listed below:\n", FormatTestCount(skipped_test_count).c_str());
     PrintSkippedTests(unit_test);
   }
@@ -3548,10 +3632,8 @@
     if (unit_test.Passed()) {
       printf("\n");  // Add a spacer if no FAILURE banner is displayed.
     }
-    ColoredPrintf(COLOR_YELLOW,
-                  "  YOU HAVE %d DISABLED %s\n\n",
-                  num_disabled,
-                  num_disabled == 1 ? "TEST" : "TESTS");
+    ColoredPrintf(GTestColor::kYellow, "  YOU HAVE %d DISABLED %s\n\n",
+                  num_disabled, num_disabled == 1 ? "TEST" : "TESTS");
   }
   // Ensure that Google Test output is printed before, e.g., heapchecker output.
   fflush(stdout);
@@ -3559,6 +3641,110 @@
 
 // End PrettyUnitTestResultPrinter
 
+// This class implements the TestEventListener interface.
+//
+// Class BriefUnitTestResultPrinter is copyable.
+class BriefUnitTestResultPrinter : public TestEventListener {
+ public:
+  BriefUnitTestResultPrinter() {}
+  static void PrintTestName(const char* test_suite, const char* test) {
+    printf("%s.%s", test_suite, test);
+  }
+
+  // The following methods override what's in the TestEventListener class.
+  void OnTestProgramStart(const UnitTest& /*unit_test*/) override {}
+  void OnTestIterationStart(const UnitTest& /*unit_test*/,
+                            int /*iteration*/) override {}
+  void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) override {}
+  void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) override {}
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseStart(const TestCase& /*test_case*/) override {}
+#else
+  void OnTestSuiteStart(const TestSuite& /*test_suite*/) override {}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  void OnTestStart(const TestInfo& /*test_info*/) override {}
+
+  void OnTestPartResult(const TestPartResult& result) override;
+  void OnTestEnd(const TestInfo& test_info) override;
+#ifndef GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+  void OnTestCaseEnd(const TestCase& /*test_case*/) override {}
+#else
+  void OnTestSuiteEnd(const TestSuite& /*test_suite*/) override {}
+#endif  // GTEST_REMOVE_LEGACY_TEST_CASEAPI_
+
+  void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) override {}
+  void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) override {}
+  void OnTestIterationEnd(const UnitTest& unit_test, int iteration) override;
+  void OnTestProgramEnd(const UnitTest& /*unit_test*/) override {}
+};
+
+// Called after an assertion failure.
+void BriefUnitTestResultPrinter::OnTestPartResult(
+    const TestPartResult& result) {
+  switch (result.type()) {
+    // If the test part succeeded, we don't need to do anything.
+    case TestPartResult::kSuccess:
+      return;
+    default:
+      // Print failure message from the assertion
+      // (e.g. expected this and got that).
+      PrintTestPartResult(result);
+      fflush(stdout);
+  }
+}
+
+void BriefUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) {
+  if (test_info.result()->Failed()) {
+    ColoredPrintf(GTestColor::kRed, "[  FAILED  ] ");
+    PrintTestName(test_info.test_suite_name(), test_info.name());
+    PrintFullTestCommentIfPresent(test_info);
+
+    if (GTEST_FLAG(print_time)) {
+      printf(" (%s ms)\n",
+             internal::StreamableToString(test_info.result()->elapsed_time())
+                 .c_str());
+    } else {
+      printf("\n");
+    }
+    fflush(stdout);
+  }
+}
+
+void BriefUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test,
+                                                    int /*iteration*/) {
+  ColoredPrintf(GTestColor::kGreen, "[==========] ");
+  printf("%s from %s ran.",
+         FormatTestCount(unit_test.test_to_run_count()).c_str(),
+         FormatTestSuiteCount(unit_test.test_suite_to_run_count()).c_str());
+  if (GTEST_FLAG(print_time)) {
+    printf(" (%s ms total)",
+           internal::StreamableToString(unit_test.elapsed_time()).c_str());
+  }
+  printf("\n");
+  ColoredPrintf(GTestColor::kGreen, "[  PASSED  ] ");
+  printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str());
+
+  const int skipped_test_count = unit_test.skipped_test_count();
+  if (skipped_test_count > 0) {
+    ColoredPrintf(GTestColor::kGreen, "[  SKIPPED ] ");
+    printf("%s.\n", FormatTestCount(skipped_test_count).c_str());
+  }
+
+  int num_disabled = unit_test.reportable_disabled_test_count();
+  if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) {
+    if (unit_test.Passed()) {
+      printf("\n");  // Add a spacer if no FAILURE banner is displayed.
+    }
+    ColoredPrintf(GTestColor::kYellow, "  YOU HAVE %d DISABLED %s\n\n",
+                  num_disabled, num_disabled == 1 ? "TEST" : "TESTS");
+  }
+  // Ensure that Google Test output is printed before, e.g., heapchecker output.
+  fflush(stdout);
+}
+
+// End BriefUnitTestResultPrinter
+
 // class TestEventRepeater
 //
 // This class forwards events to other event listeners.
@@ -3742,6 +3928,16 @@
   // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
   static void OutputXmlCDataSection(::std::ostream* stream, const char* data);
 
+  // Streams a test suite XML stanza containing the given test result.
+  //
+  // Requires: result.Failed()
+  static void OutputXmlTestSuiteForTestResult(::std::ostream* stream,
+                                              const TestResult& result);
+
+  // Streams an XML representation of a TestResult object.
+  static void OutputXmlTestResult(::std::ostream* stream,
+                                  const TestResult& result);
+
   // Streams an XML representation of a TestInfo object.
   static void OutputXmlTestInfo(::std::ostream* stream,
                                 const char* test_suite_name,
@@ -3900,6 +4096,10 @@
   if (tm_ptr == nullptr) return false;
   *out = *tm_ptr;
   return true;
+#elif defined(__STDC_LIB_EXT1__)
+  // Uses localtime_s when available as localtime_r is only available from
+  // C23 standard.
+  return localtime_s(&seconds, out) != nullptr;
 #else
   return localtime_r(&seconds, out) != nullptr;
 #endif
@@ -3911,13 +4111,14 @@
   struct tm time_struct;
   if (!PortableLocaltime(static_cast<time_t>(ms / 1000), &time_struct))
     return "";
-  // YYYY-MM-DDThh:mm:ss
+  // YYYY-MM-DDThh:mm:ss.sss
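+  // (for example, "2011-10-31T18:52:42.000"; no time zone suffix is added)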
   return StreamableToString(time_struct.tm_year + 1900) + "-" +
       String::FormatIntWidth2(time_struct.tm_mon + 1) + "-" +
       String::FormatIntWidth2(time_struct.tm_mday) + "T" +
       String::FormatIntWidth2(time_struct.tm_hour) + ":" +
       String::FormatIntWidth2(time_struct.tm_min) + ":" +
-      String::FormatIntWidth2(time_struct.tm_sec);
+      String::FormatIntWidth2(time_struct.tm_sec) + "." +
+      String::FormatIntWidthN(static_cast<int>(ms % 1000), 3);
 }
 
 // Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
@@ -3956,6 +4157,43 @@
   *stream << " " << name << "=\"" << EscapeXmlAttribute(value) << "\"";
 }
 
+// Streams a test suite XML stanza containing the given test result.
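+//
+// The stanza is a minimal <testsuite> named "NonTestSuiteFailure" wrapping a
+// single anonymous <testcase>, roughly:
+//   <testsuite name="NonTestSuiteFailure" tests="1" failures="1" ...>
+//     <testcase name="" status="run" ...>...</testcase>
+//   </testsuite>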
+void XmlUnitTestResultPrinter::OutputXmlTestSuiteForTestResult(
+    ::std::ostream* stream, const TestResult& result) {
+  // Output the boilerplate for a minimal test suite with one test.
+  *stream << "  <testsuite";
+  OutputXmlAttribute(stream, "testsuite", "name", "NonTestSuiteFailure");
+  OutputXmlAttribute(stream, "testsuite", "tests", "1");
+  OutputXmlAttribute(stream, "testsuite", "failures", "1");
+  OutputXmlAttribute(stream, "testsuite", "disabled", "0");
+  OutputXmlAttribute(stream, "testsuite", "skipped", "0");
+  OutputXmlAttribute(stream, "testsuite", "errors", "0");
+  OutputXmlAttribute(stream, "testsuite", "time",
+                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
+  OutputXmlAttribute(
+      stream, "testsuite", "timestamp",
+      FormatEpochTimeInMillisAsIso8601(result.start_timestamp()));
+  *stream << ">";
+
+  // Output the boilerplate for a minimal test case with a single test.
+  *stream << "    <testcase";
+  OutputXmlAttribute(stream, "testcase", "name", "");
+  OutputXmlAttribute(stream, "testcase", "status", "run");
+  OutputXmlAttribute(stream, "testcase", "result", "completed");
+  OutputXmlAttribute(stream, "testcase", "classname", "");
+  OutputXmlAttribute(stream, "testcase", "time",
+                     FormatTimeInMillisAsSeconds(result.elapsed_time()));
+  OutputXmlAttribute(
+      stream, "testcase", "timestamp",
+      FormatEpochTimeInMillisAsIso8601(result.start_timestamp()));
+
+  // Output the actual test result.
+  OutputXmlTestResult(stream, result);
+
+  // Complete the test suite.
+  *stream << "  </testsuite>\n";
+}
+
 // Prints an XML representation of a TestInfo object.
 void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
                                                  const char* test_suite_name,
@@ -3999,11 +4237,17 @@
       FormatEpochTimeInMillisAsIso8601(result.start_timestamp()));
   OutputXmlAttribute(stream, kTestsuite, "classname", test_suite_name);
 
+  OutputXmlTestResult(stream, result);
+}
+
+void XmlUnitTestResultPrinter::OutputXmlTestResult(::std::ostream* stream,
+                                                   const TestResult& result) {
   int failures = 0;
+  int skips = 0;
   for (int i = 0; i < result.total_part_count(); ++i) {
     const TestPartResult& part = result.GetTestPartResult(i);
     if (part.failed()) {
-      if (++failures == 1) {
+      if (++failures == 1 && skips == 0) {
         *stream << ">\n";
       }
       const std::string location =
@@ -4011,18 +4255,31 @@
                                                           part.line_number());
       const std::string summary = location + "\n" + part.summary();
       *stream << "      <failure message=\""
-              << EscapeXmlAttribute(summary.c_str())
+              << EscapeXmlAttribute(summary)
               << "\" type=\"\">";
       const std::string detail = location + "\n" + part.message();
       OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
       *stream << "</failure>\n";
+    } else if (part.skipped()) {
+      if (++skips == 1 && failures == 0) {
+        *stream << ">\n";
+      }
+      const std::string location =
+          internal::FormatCompilerIndependentFileLocation(part.file_name(),
+                                                          part.line_number());
+      const std::string summary = location + "\n" + part.summary();
+      *stream << "      <skipped message=\""
+              << EscapeXmlAttribute(summary.c_str()) << "\">";
+      const std::string detail = location + "\n" + part.message();
+      OutputXmlCDataSection(stream, RemoveInvalidXmlCharacters(detail).c_str());
+      *stream << "</skipped>\n";
     }
   }
 
-  if (failures == 0 && result.test_property_count() == 0) {
+  if (failures == 0 && skips == 0 && result.test_property_count() == 0) {
     *stream << " />\n";
   } else {
-    if (failures == 0) {
+    if (failures == 0 && skips == 0) {
       *stream << ">\n";
     }
     OutputXmlTestProperties(stream, result);
@@ -4044,7 +4301,11 @@
     OutputXmlAttribute(
         stream, kTestsuite, "disabled",
         StreamableToString(test_suite.reportable_disabled_test_count()));
+    OutputXmlAttribute(stream, kTestsuite, "skipped",
+                       StreamableToString(test_suite.skipped_test_count()));
+
     OutputXmlAttribute(stream, kTestsuite, "errors", "0");
+
     OutputXmlAttribute(stream, kTestsuite, "time",
                        FormatTimeInMillisAsSeconds(test_suite.elapsed_time()));
     OutputXmlAttribute(
@@ -4095,6 +4356,13 @@
     if (unit_test.GetTestSuite(i)->reportable_test_count() > 0)
       PrintXmlTestSuite(stream, *unit_test.GetTestSuite(i));
   }
+
+  // If there was a test failure outside of one of the test suites (like in a
+  // test environment), include that in the output.
+  if (unit_test.ad_hoc_test_result().Failed()) {
+    OutputXmlTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result());
+  }
+
   *stream << "</" << kTestsuites << ">\n";
 }
 
@@ -4185,6 +4453,16 @@
                             const std::string& indent,
                             bool comma = true);
 
+  // Streams a test suite JSON stanza containing the given test result.
+  //
+  // Requires: result.Failed()
+  static void OutputJsonTestSuiteForTestResult(::std::ostream* stream,
+                                               const TestResult& result);
+
+  // Streams a JSON representation of a TestResult object.
+  static void OutputJsonTestResult(::std::ostream* stream,
+                                   const TestResult& result);
+
   // Streams a JSON representation of a TestInfo object.
   static void OutputJsonTestInfo(::std::ostream* stream,
                                  const char* test_suite_name,
@@ -4335,6 +4613,48 @@
     *stream << ",\n";
 }
 
+// Streams a test suite JSON stanza containing the given test result.
+void JsonUnitTestResultPrinter::OutputJsonTestSuiteForTestResult(
+    ::std::ostream* stream, const TestResult& result) {
+  // Output the boilerplate for a new test suite.
+  *stream << Indent(4) << "{\n";
+  OutputJsonKey(stream, "testsuite", "name", "NonTestSuiteFailure", Indent(6));
+  OutputJsonKey(stream, "testsuite", "tests", 1, Indent(6));
+  if (!GTEST_FLAG(list_tests)) {
+    OutputJsonKey(stream, "testsuite", "failures", 1, Indent(6));
+    OutputJsonKey(stream, "testsuite", "disabled", 0, Indent(6));
+    OutputJsonKey(stream, "testsuite", "skipped", 0, Indent(6));
+    OutputJsonKey(stream, "testsuite", "errors", 0, Indent(6));
+    OutputJsonKey(stream, "testsuite", "time",
+                  FormatTimeInMillisAsDuration(result.elapsed_time()),
+                  Indent(6));
+    OutputJsonKey(stream, "testsuite", "timestamp",
+                  FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()),
+                  Indent(6));
+  }
+  *stream << Indent(6) << "\"testsuite\": [\n";
+
+  // Output the boilerplate for a new test case.
+  *stream << Indent(8) << "{\n";
+  OutputJsonKey(stream, "testcase", "name", "", Indent(10));
+  OutputJsonKey(stream, "testcase", "status", "RUN", Indent(10));
+  OutputJsonKey(stream, "testcase", "result", "COMPLETED", Indent(10));
+  OutputJsonKey(stream, "testcase", "timestamp",
+                FormatEpochTimeInMillisAsRFC3339(result.start_timestamp()),
+                Indent(10));
+  OutputJsonKey(stream, "testcase", "time",
+                FormatTimeInMillisAsDuration(result.elapsed_time()),
+                Indent(10));
+  OutputJsonKey(stream, "testcase", "classname", "", Indent(10), false);
+  *stream << TestPropertiesAsJson(result, Indent(10));
+
+  // Output the actual test result.
+  OutputJsonTestResult(stream, result);
+
+  // Finish the test suite.
+  *stream << "\n" << Indent(6) << "]\n" << Indent(4) << "}";
+}
+
 // Prints a JSON representation of a TestInfo object.
 void JsonUnitTestResultPrinter::OutputJsonTestInfo(::std::ostream* stream,
                                                    const char* test_suite_name,
@@ -4377,6 +4697,13 @@
                 false);
   *stream << TestPropertiesAsJson(result, kIndent);
 
+  OutputJsonTestResult(stream, result);
+}
+
+void JsonUnitTestResultPrinter::OutputJsonTestResult(::std::ostream* stream,
+                                                     const TestResult& result) {
+  const std::string kIndent = Indent(10);
+
   int failures = 0;
   for (int i = 0; i < result.total_part_count(); ++i) {
     const TestPartResult& part = result.GetTestPartResult(i);
@@ -4487,6 +4814,12 @@
     }
   }
 
+  // If there was a test failure outside of one of the test suites (like in a
+  // test environment), include that in the output.
+  if (unit_test.ad_hoc_test_result().Failed()) {
+    OutputJsonTestSuiteForTestResult(stream, unit_test.ad_hoc_test_result());
+  }
+
   *stream << "\n" << kIndent << "]\n" << "}\n";
 }
 
@@ -5309,6 +5642,10 @@
     // to shut down the default XML output before invoking RUN_ALL_TESTS.
     ConfigureXmlOutput();
 
+    if (GTEST_FLAG(brief)) {
+      listeners()->SetDefaultResultPrinter(new BriefUnitTestResultPrinter);
+    }
+
 #if GTEST_CAN_STREAM_RESULTS_
     // Configures listeners for streaming test results to the specified server.
     ConfigureStreamingOutput();
@@ -5354,10 +5691,10 @@
 // Arguments:
 //
 //   test_suite_name: name of the test suite
-//   type_param:     the name of the test suite's type parameter, or NULL if
-//                   this is not a typed or a type-parameterized test suite.
-//   set_up_tc:      pointer to the function that sets up the test suite
-//   tear_down_tc:   pointer to the function that tears down the test suite
+//   type_param:      the name of the test suite's type parameter, or NULL if
+//                    this is not a typed or a type-parameterized test suite.
+//   set_up_tc:       pointer to the function that sets up the test suite
+//   tear_down_tc:    pointer to the function that tears down the test suite
 TestSuite* UnitTestImpl::GetTestSuite(
     const char* test_suite_name, const char* type_param,
     internal::SetUpTestSuiteFunc set_up_tc,
@@ -5475,7 +5812,7 @@
     // assertions executed before RUN_ALL_TESTS().
     ClearNonAdHocTestResult();
 
-    const TimeInMillis start = GetTimeInMillis();
+    Timer timer;
 
     // Shuffles test suites and tests if requested.
     if (has_tests_to_run && GTEST_FLAG(shuffle)) {
@@ -5516,6 +5853,21 @@
         for (int test_index = 0; test_index < total_test_suite_count();
              test_index++) {
           GetMutableSuiteCase(test_index)->Run();
+          if (GTEST_FLAG(fail_fast) &&
+              GetMutableSuiteCase(test_index)->Failed()) {
+            for (int j = test_index + 1; j < total_test_suite_count(); j++) {
+              GetMutableSuiteCase(j)->Skip();
+            }
+            break;
+          }
+        }
+      } else if (Test::HasFatalFailure()) {
+        // If there was a fatal failure during the global setup, then we know
+        // we aren't going to run any tests. Explicitly mark all of the tests
+        // as skipped to make this obvious in the output.
+        for (int test_index = 0; test_index < total_test_suite_count();
+             test_index++) {
+          GetMutableSuiteCase(test_index)->Skip();
         }
       }
 
@@ -5526,7 +5878,7 @@
       repeater->OnEnvironmentsTearDownEnd(*parent_);
     }
 
-    elapsed_time_ = GetTimeInMillis() - start;
+    elapsed_time_ = timer.Elapsed();
 
     // Tells the unit test event listener that the tests have just finished.
     repeater->OnTestIterationEnd(*parent_, i);
@@ -5554,14 +5906,14 @@
 
   if (!gtest_is_initialized_before_run_all_tests) {
     ColoredPrintf(
-        COLOR_RED,
+        GTestColor::kRed,
         "\nIMPORTANT NOTICE - DO NOT IGNORE:\n"
         "This test program did NOT call " GTEST_INIT_GOOGLE_TEST_NAME_
         "() before calling RUN_ALL_TESTS(). This is INVALID. Soon " GTEST_NAME_
         " will start to enforce the valid usage. "
         "Please fix it ASAP, or IT WILL START TO FAIL.\n");  // NOLINT
 #if GTEST_FOR_GOOGLE_
-    ColoredPrintf(COLOR_RED,
+    ColoredPrintf(GTestColor::kRed,
                   "For more details, see http://wiki/Main/ValidGUnitMain.\n");
 #endif  // GTEST_FOR_GOOGLE_
   }
@@ -5578,7 +5930,7 @@
   if (test_shard_file != nullptr) {
     FILE* const file = posix::FOpen(test_shard_file, "w");
     if (file == nullptr) {
-      ColoredPrintf(COLOR_RED,
+      ColoredPrintf(GTestColor::kRed,
                     "Could not write to the test shard status file \"%s\" "
                     "specified by the %s environment variable.\n",
                     test_shard_file, kTestShardStatusFile);
@@ -5612,7 +5964,7 @@
       << "Invalid environment variables: you have "
       << kTestShardIndex << " = " << shard_index
       << ", but have left " << kTestTotalShards << " unset.\n";
-    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+    ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
     fflush(stdout);
     exit(EXIT_FAILURE);
   } else if (total_shards != -1 && shard_index == -1) {
@@ -5620,7 +5972,7 @@
       << "Invalid environment variables: you have "
       << kTestTotalShards << " = " << total_shards
       << ", but have left " << kTestShardIndex << " unset.\n";
-    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+    ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
     fflush(stdout);
     exit(EXIT_FAILURE);
   } else if (shard_index < 0 || shard_index >= total_shards) {
@@ -5629,7 +5981,7 @@
       << kTestShardIndex << " < " << kTestTotalShards
       << ", but you have " << kTestShardIndex << "=" << shard_index
       << ", " << kTestTotalShards << "=" << total_shards << ".\n";
-    ColoredPrintf(COLOR_RED, "%s", msg.GetString().c_str());
+    ColoredPrintf(GTestColor::kRed, "%s", msg.GetString().c_str());
     fflush(stdout);
     exit(EXIT_FAILURE);
   }
@@ -6019,7 +6371,7 @@
 //   @D    changes to the default terminal text color.
 //
 static void PrintColorEncoded(const char* str) {
-  GTestColor color = COLOR_DEFAULT;  // The current color.
+  GTestColor color = GTestColor::kDefault;  // The current color.
 
   // Conceptually, we split the string into segments divided by escape
   // sequences.  Then we print one segment at a time.  At the end of
@@ -6039,13 +6391,13 @@
     if (ch == '@') {
       ColoredPrintf(color, "@");
     } else if (ch == 'D') {
-      color = COLOR_DEFAULT;
+      color = GTestColor::kDefault;
     } else if (ch == 'R') {
-      color = COLOR_RED;
+      color = GTestColor::kRed;
     } else if (ch == 'G') {
-      color = COLOR_GREEN;
+      color = GTestColor::kGreen;
     } else if (ch == 'Y') {
-      color = COLOR_YELLOW;
+      color = GTestColor::kYellow;
     } else {
       --str;
     }
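The COLOR_* renames throughout these hunks come from GTestColor becoming a scoped enumeration. In sketch form:

```cpp
// Sketch of the migration: a scoped enum keeps kRed and friends out of the
// enclosing namespace, so every call site now qualifies the enumerator.
enum class GTestColor { kDefault, kRed, kGreen, kYellow };

// Before: ColoredPrintf(COLOR_RED, "...");
// After:  ColoredPrintf(GTestColor::kRed, "...");
```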
@@ -6053,98 +6405,126 @@
 }
 
 static const char kColorEncodedHelpMessage[] =
-"This program contains tests written using " GTEST_NAME_ ". You can use the\n"
-"following command line flags to control its behavior:\n"
-"\n"
-"Test Selection:\n"
-"  @G--" GTEST_FLAG_PREFIX_ "list_tests@D\n"
-"      List the names of all tests instead of running them. The name of\n"
-"      TEST(Foo, Bar) is \"Foo.Bar\".\n"
-"  @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSTIVE_PATTERNS"
+    "This program contains tests written using " GTEST_NAME_
+    ". You can use the\n"
+    "following command line flags to control its behavior:\n"
+    "\n"
+    "Test Selection:\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "list_tests@D\n"
+    "      List the names of all tests instead of running them. The name of\n"
+    "      TEST(Foo, Bar) is \"Foo.Bar\".\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "filter=@YPOSITIVE_PATTERNS"
     "[@G-@YNEGATIVE_PATTERNS]@D\n"
-"      Run only the tests whose name matches one of the positive patterns but\n"
-"      none of the negative patterns. '?' matches any single character; '*'\n"
-"      matches any substring; ':' separates two patterns.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n"
-"      Run all disabled tests too.\n"
-"\n"
-"Test Execution:\n"
-"  @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n"
-"      Run the tests repeatedly; use a negative count to repeat forever.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "shuffle@D\n"
-"      Randomize tests' orders on every iteration.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n"
-"      Random number seed to use for shuffling test orders (between 1 and\n"
-"      99999, or 0 to use a seed based on the current time).\n"
-"\n"
-"Test Output:\n"
-"  @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n"
-"      Enable/disable colored output. The default is @Gauto@D.\n"
-"  -@G-" GTEST_FLAG_PREFIX_ "print_time=0@D\n"
-"      Don't print the elapsed time of each test.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "output=@Y(@Gjson@Y|@Gxml@Y)[@G:@YDIRECTORY_PATH@G"
-    GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n"
-"      Generate a JSON or XML report in the given directory or with the given\n"
-"      file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n"
+    "      Run only the tests whose name matches one of the positive patterns "
+    "but\n"
+    "      none of the negative patterns. '?' matches any single character; "
+    "'*'\n"
+    "      matches any substring; ':' separates two patterns.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "also_run_disabled_tests@D\n"
+    "      Run all disabled tests too.\n"
+    "\n"
+    "Test Execution:\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "repeat=@Y[COUNT]@D\n"
+    "      Run the tests repeatedly; use a negative count to repeat forever.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "shuffle@D\n"
+    "      Randomize tests' orders on every iteration.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "random_seed=@Y[NUMBER]@D\n"
+    "      Random number seed to use for shuffling test orders (between 1 and\n"
+    "      99999, or 0 to use a seed based on the current time).\n"
+    "\n"
+    "Test Output:\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n"
+    "      Enable/disable colored output. The default is @Gauto@D.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "brief=1@D\n"
+    "      Only print test failures.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "print_time=0@D\n"
+    "      Don't print the elapsed time of each test.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "output=@Y(@Gjson@Y|@Gxml@Y)[@G:@YDIRECTORY_PATH@G" GTEST_PATH_SEP_
+    "@Y|@G:@YFILE_PATH]@D\n"
+    "      Generate a JSON or XML report in the given directory or with the "
+    "given\n"
+    "      file name. @YFILE_PATH@D defaults to @Gtest_detail.xml@D.\n"
 # if GTEST_CAN_STREAM_RESULTS_
-"  @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n"
-"      Stream test results to the given server.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "stream_result_to=@YHOST@G:@YPORT@D\n"
+    "      Stream test results to the given server.\n"
 # endif  // GTEST_CAN_STREAM_RESULTS_
-"\n"
-"Assertion Behavior:\n"
+    "\n"
+    "Assertion Behavior:\n"
 # if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
-"  @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
-"      Set the default death test style.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n"
+    "      Set the default death test style.\n"
 # endif  // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS
-"  @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n"
-"      Turn assertion failures into debugger break-points.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n"
-"      Turn assertion failures into C++ exceptions for use by an external\n"
-"      test framework.\n"
-"  @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n"
-"      Do not report exceptions as test failures. Instead, allow them\n"
-"      to crash the program or throw a pop-up (on Windows).\n"
-"\n"
-"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests@D, you can alternatively set "
+    "  @G--" GTEST_FLAG_PREFIX_
+    "break_on_failure@D\n"
+    "      Turn assertion failures into debugger break-points.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "throw_on_failure@D\n"
+    "      Turn assertion failures into C++ exceptions for use by an external\n"
+    "      test framework.\n"
+    "  @G--" GTEST_FLAG_PREFIX_
+    "catch_exceptions=0@D\n"
+    "      Do not report exceptions as test failures. Instead, allow them\n"
+    "      to crash the program or throw a pop-up (on Windows).\n"
+    "\n"
+    "Except for @G--" GTEST_FLAG_PREFIX_
+    "list_tests@D, you can alternatively set "
     "the corresponding\n"
-"environment variable of a flag (all letters in upper-case). For example, to\n"
-"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_
+    "environment variable of a flag (all letters in upper-case). For example, "
+    "to\n"
+    "disable colored text output, you can either specify "
+    "@G--" GTEST_FLAG_PREFIX_
     "color=no@D or set\n"
-"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR@D environment variable to @Gno@D.\n"
-"\n"
-"For more information, please read the " GTEST_NAME_ " documentation at\n"
-"@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n"
-"(not one in your own code or tests), please report it to\n"
-"@G<" GTEST_DEV_EMAIL_ ">@D.\n";
+    "the @G" GTEST_FLAG_PREFIX_UPPER_
+    "COLOR@D environment variable to @Gno@D.\n"
+    "\n"
+    "For more information, please read the " GTEST_NAME_
+    " documentation at\n"
+    "@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_
+    "\n"
+    "(not one in your own code or tests), please report it to\n"
+    "@G<" GTEST_DEV_EMAIL_ ">@D.\n";
 
 static bool ParseGoogleTestFlag(const char* const arg) {
   return ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag,
                        &GTEST_FLAG(also_run_disabled_tests)) ||
-      ParseBoolFlag(arg, kBreakOnFailureFlag,
-                    &GTEST_FLAG(break_on_failure)) ||
-      ParseBoolFlag(arg, kCatchExceptionsFlag,
-                    &GTEST_FLAG(catch_exceptions)) ||
-      ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
-      ParseStringFlag(arg, kDeathTestStyleFlag,
-                      &GTEST_FLAG(death_test_style)) ||
-      ParseBoolFlag(arg, kDeathTestUseFork,
-                    &GTEST_FLAG(death_test_use_fork)) ||
-      ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
-      ParseStringFlag(arg, kInternalRunDeathTestFlag,
-                      &GTEST_FLAG(internal_run_death_test)) ||
-      ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
-      ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
-      ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
-      ParseBoolFlag(arg, kPrintUTF8Flag, &GTEST_FLAG(print_utf8)) ||
-      ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
-      ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
-      ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
-      ParseInt32Flag(arg, kStackTraceDepthFlag,
-                     &GTEST_FLAG(stack_trace_depth)) ||
-      ParseStringFlag(arg, kStreamResultToFlag,
-                      &GTEST_FLAG(stream_result_to)) ||
-      ParseBoolFlag(arg, kThrowOnFailureFlag,
-                    &GTEST_FLAG(throw_on_failure));
+         ParseBoolFlag(arg, kBreakOnFailureFlag,
+                       &GTEST_FLAG(break_on_failure)) ||
+         ParseBoolFlag(arg, kCatchExceptionsFlag,
+                       &GTEST_FLAG(catch_exceptions)) ||
+         ParseStringFlag(arg, kColorFlag, &GTEST_FLAG(color)) ||
+         ParseStringFlag(arg, kDeathTestStyleFlag,
+                         &GTEST_FLAG(death_test_style)) ||
+         ParseBoolFlag(arg, kDeathTestUseFork,
+                       &GTEST_FLAG(death_test_use_fork)) ||
+         ParseBoolFlag(arg, kFailFast, &GTEST_FLAG(fail_fast)) ||
+         ParseStringFlag(arg, kFilterFlag, &GTEST_FLAG(filter)) ||
+         ParseStringFlag(arg, kInternalRunDeathTestFlag,
+                         &GTEST_FLAG(internal_run_death_test)) ||
+         ParseBoolFlag(arg, kListTestsFlag, &GTEST_FLAG(list_tests)) ||
+         ParseStringFlag(arg, kOutputFlag, &GTEST_FLAG(output)) ||
+         ParseBoolFlag(arg, kBriefFlag, &GTEST_FLAG(brief)) ||
+         ParseBoolFlag(arg, kPrintTimeFlag, &GTEST_FLAG(print_time)) ||
+         ParseBoolFlag(arg, kPrintUTF8Flag, &GTEST_FLAG(print_utf8)) ||
+         ParseInt32Flag(arg, kRandomSeedFlag, &GTEST_FLAG(random_seed)) ||
+         ParseInt32Flag(arg, kRepeatFlag, &GTEST_FLAG(repeat)) ||
+         ParseBoolFlag(arg, kShuffleFlag, &GTEST_FLAG(shuffle)) ||
+         ParseInt32Flag(arg, kStackTraceDepthFlag,
+                        &GTEST_FLAG(stack_trace_depth)) ||
+         ParseStringFlag(arg, kStreamResultToFlag,
+                         &GTEST_FLAG(stream_result_to)) ||
+         ParseBoolFlag(arg, kThrowOnFailureFlag, &GTEST_FLAG(throw_on_failure));
 }
 
 #if GTEST_USE_OWN_FLAGFILE_FLAG_
@@ -6314,24 +6694,31 @@
 std::string TempDir() {
 #if defined(GTEST_CUSTOM_TEMPDIR_FUNCTION_)
   return GTEST_CUSTOM_TEMPDIR_FUNCTION_();
-#endif
-
-#if GTEST_OS_WINDOWS_MOBILE
+#elif GTEST_OS_WINDOWS_MOBILE
   return "\\temp\\";
 #elif GTEST_OS_WINDOWS
   const char* temp_dir = internal::posix::GetEnv("TEMP");
-  if (temp_dir == nullptr || temp_dir[0] == '\0')
+  if (temp_dir == nullptr || temp_dir[0] == '\0') {
     return "\\temp\\";
-  else if (temp_dir[strlen(temp_dir) - 1] == '\\')
+  } else if (temp_dir[strlen(temp_dir) - 1] == '\\') {
     return temp_dir;
-  else
+  } else {
     return std::string(temp_dir) + "\\";
+  }
 #elif GTEST_OS_LINUX_ANDROID
   const char* temp_dir = internal::posix::GetEnv("TEST_TMPDIR");
-  if (temp_dir == nullptr || temp_dir[0] == '\0')
+  if (temp_dir == nullptr || temp_dir[0] == '\0') {
     return "/data/local/tmp/";
-  else
+  } else {
     return temp_dir;
+  }
+#elif GTEST_OS_LINUX
+  const char* temp_dir = internal::posix::GetEnv("TEST_TMPDIR");
+  if (temp_dir == nullptr || temp_dir[0] == '\0') {
+    return "/tmp/";
+  } else {
+    return temp_dir;
+  }
 #else
   return "/tmp/";
 #endif  // GTEST_OS_WINDOWS_MOBILE
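With the restructured #elif chain above, desktop Linux now honors TEST_TMPDIR just like Android. A small usage sketch:

```cpp
// Usage sketch: TempDir() resolves per platform -- TEST_TMPDIR if set on
// Linux/Android, TEMP on Windows, otherwise a fixed fallback like "/tmp/".
#include <cstdio>
#include "gtest/gtest.h"

TEST(TempDirDemo, ShowsResolvedDirectory) {
  std::printf("writable temp dir: %s\n", ::testing::TempDir().c_str());
}
```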
diff --git a/libvpx/third_party/libwebm/README.libvpx b/libvpx/third_party/libwebm/README.libvpx
index 1e87afd..325604c 100644
--- a/libvpx/third_party/libwebm/README.libvpx
+++ b/libvpx/third_party/libwebm/README.libvpx
@@ -1,5 +1,5 @@
 URL: https://chromium.googlesource.com/webm/libwebm
-Version: 37d9b860ebbf40cb0f6dcb7a6fef452d798062da
+Version: ee0bab576c338c9807249b99588e352b7268cb62
 License: BSD
 License File: LICENSE.txt
 
diff --git a/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc b/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc
index 5120312..ae36531 100644
--- a/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc
+++ b/libvpx/third_party/libwebm/mkvmuxer/mkvmuxer.cc
@@ -774,7 +774,7 @@
     return false;
 
   // AV1 tracks require a CodecPrivate. See
-  // https://github.com/Matroska-Org/matroska-specification/blob/av1-mappin/codec/av1.md
+  // https://github.com/ietf-wg-cellar/matroska-specification/blob/HEAD/codec/av1.md
   // TODO(tomfinegan): Update the above link to the AV1 Matroska mappings to
   // point to a stable version once it is finalized, or our own WebM mappings
   // page on webmproject.org should we decide to release them.
@@ -3084,6 +3084,7 @@
       accurate_cluster_duration_(false),
       fixed_size_cluster_timecode_(false),
       estimate_file_duration_(false),
+      ebml_header_size_(0),
       payload_pos_(0),
       size_position_(0),
       doc_type_version_(kDefaultDocTypeVersion),
@@ -4105,12 +4106,16 @@
     // places where |doc_type_version_| needs to be updated.
     if (frame->discard_padding() != 0)
       doc_type_version_ = 4;
-    if (!cluster->AddFrame(frame))
-      return -1;
+    if (!cluster->AddFrame(frame)) {
+      delete frame;
+      continue;
+    }
 
     if (new_cuepoint_ && cues_track_ == frame->track_number()) {
-      if (!AddCuePoint(frame->timestamp(), cues_track_))
-        return -1;
+      if (!AddCuePoint(frame->timestamp(), cues_track_)) {
+        delete frame;
+        continue;
+      }
     }
 
     if (frame->timestamp() > last_timestamp_) {
@@ -4153,12 +4158,16 @@
       const Frame* const frame_prev = frames_[i - 1];
       if (frame_prev->discard_padding() != 0)
         doc_type_version_ = 4;
-      if (!cluster->AddFrame(frame_prev))
-        return false;
+      if (!cluster->AddFrame(frame_prev)) {
+        delete frame_prev;
+        continue;
+      }
 
       if (new_cuepoint_ && cues_track_ == frame_prev->track_number()) {
-        if (!AddCuePoint(frame_prev->timestamp(), cues_track_))
-          return false;
+        if (!AddCuePoint(frame_prev->timestamp(), cues_track_)) {
+          delete frame_prev;
+          continue;
+        }
       }
 
       ++shift_left;
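Both hunks above apply the same leak fix: a queued frame that cannot be added (or cued) is now freed and skipped instead of aborting the flush and orphaning the rest of the queue. Schematically, with hypothetical stand-in types:

```cpp
// Schematic of the fix, not the real mkvmuxer types: on failure the frame
// is deleted and the loop continues, where the old code returned early and
// leaked every frame still waiting in the queue.
struct Frame {};
struct Cluster { bool AddFrame(const Frame*) { return true; } };

void FlushQueue(Cluster* cluster, Frame** frames, int count) {
  for (int i = 0; i < count; ++i) {
    Frame* frame = frames[i];
    if (!cluster->AddFrame(frame)) {
      delete frame;  // previously leaked behind an early `return -1`
      continue;      // keep draining the rest of the queue
    }
    // ... on success, ownership is released exactly as before ...
  }
}
```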
diff --git a/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc b/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
index 6436817..bd2f769 100644
--- a/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
+++ b/libvpx/third_party/libwebm/mkvmuxer/mkvmuxerutil.cc
@@ -606,8 +606,8 @@
 
 void GetVersion(int32* major, int32* minor, int32* build, int32* revision) {
   *major = 0;
-  *minor = 2;
-  *build = 1;
+  *minor = 3;
+  *build = 0;
   *revision = 0;
 }
 
diff --git a/libvpx/third_party/libwebm/mkvparser/mkvparser.cc b/libvpx/third_party/libwebm/mkvparser/mkvparser.cc
index ace65bd..de8884b 100644
--- a/libvpx/third_party/libwebm/mkvparser/mkvparser.cc
+++ b/libvpx/third_party/libwebm/mkvparser/mkvparser.cc
@@ -54,9 +54,9 @@
 
 void GetVersion(int& major, int& minor, int& build, int& revision) {
   major = 1;
-  minor = 0;
+  minor = 1;
   build = 0;
-  revision = 30;
+  revision = 0;
 }
 
 long long ReadUInt(IMkvReader* pReader, long long pos, long& len) {
@@ -1502,8 +1502,8 @@
 
   // first count the seek head entries
 
-  int entry_count = 0;
-  int void_element_count = 0;
+  long long entry_count = 0;
+  long long void_element_count = 0;
 
   while (pos < stop) {
     long long id, size;
@@ -1513,10 +1513,15 @@
     if (status < 0)  // error
       return status;
 
-    if (id == libwebm::kMkvSeek)
+    if (id == libwebm::kMkvSeek) {
       ++entry_count;
-    else if (id == libwebm::kMkvVoid)
+      if (entry_count > INT_MAX)
+        return E_PARSE_FAILED;
+    } else if (id == libwebm::kMkvVoid) {
       ++void_element_count;
+      if (void_element_count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
 
     pos += size;  // consume payload
 
@@ -1528,14 +1533,15 @@
     return E_FILE_FORMAT_INVALID;
 
   if (entry_count > 0) {
-    m_entries = new (std::nothrow) Entry[entry_count];
+    m_entries = new (std::nothrow) Entry[static_cast<size_t>(entry_count)];
 
     if (m_entries == NULL)
       return -1;
   }
 
   if (void_element_count > 0) {
-    m_void_elements = new (std::nothrow) VoidElement[void_element_count];
+    m_void_elements =
+        new (std::nothrow) VoidElement[static_cast<size_t>(void_element_count)];
 
     if (m_void_elements == NULL)
       return -1;
@@ -1582,13 +1588,13 @@
 
   ptrdiff_t count_ = ptrdiff_t(pEntry - m_entries);
   assert(count_ >= 0);
-  assert(count_ <= entry_count);
+  assert(static_cast<long long>(count_) <= entry_count);
 
   m_entry_count = static_cast<int>(count_);
 
   count_ = ptrdiff_t(pVoidElement - m_void_elements);
   assert(count_ >= 0);
-  assert(count_ <= void_element_count);
+  assert(static_cast<long long>(count_) <= void_element_count);
 
   m_void_element_count = static_cast<int>(count_);
 
@@ -2299,7 +2305,7 @@
   long long pos = pos_;
 
   // First count number of track positions
-
+  unsigned long long track_positions_count = 0;
   while (pos < stop) {
     long len;
 
@@ -2323,12 +2329,17 @@
     if (id == libwebm::kMkvCueTime)
       m_timecode = UnserializeUInt(pReader, pos, size);
 
-    else if (id == libwebm::kMkvCueTrackPositions)
-      ++m_track_positions_count;
+    else if (id == libwebm::kMkvCueTrackPositions) {
+      ++track_positions_count;
+      if (track_positions_count > UINT_MAX)
+        return E_PARSE_FAILED;
+    }
 
     pos += size;  // consume payload
   }
 
+  m_track_positions_count = static_cast<size_t>(track_positions_count);
+
   if (m_timecode < 0 || m_track_positions_count <= 0) {
     return false;
   }
@@ -4194,8 +4205,8 @@
   const long long stop = start + size;
 
   // Count ContentCompression and ContentEncryption elements.
-  int compression_count = 0;
-  int encryption_count = 0;
+  long long compression_count = 0;
+  long long encryption_count = 0;
 
   while (pos < stop) {
     long long id, size;
@@ -4203,11 +4214,17 @@
     if (status < 0)  // error
       return status;
 
-    if (id == libwebm::kMkvContentCompression)
+    if (id == libwebm::kMkvContentCompression) {
       ++compression_count;
+      if (compression_count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
 
-    if (id == libwebm::kMkvContentEncryption)
+    if (id == libwebm::kMkvContentEncryption) {
       ++encryption_count;
+      if (encryption_count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
 
     pos += size;  // consume payload
     if (pos > stop)
@@ -4218,16 +4235,16 @@
     return -1;
 
   if (compression_count > 0) {
-    compression_entries_ =
-        new (std::nothrow) ContentCompression*[compression_count];
+    compression_entries_ = new (std::nothrow)
+        ContentCompression*[static_cast<size_t>(compression_count)];
     if (!compression_entries_)
       return -1;
     compression_entries_end_ = compression_entries_;
   }
 
   if (encryption_count > 0) {
-    encryption_entries_ =
-        new (std::nothrow) ContentEncryption*[encryption_count];
+    encryption_entries_ = new (std::nothrow)
+        ContentEncryption*[static_cast<size_t>(encryption_count)];
     if (!encryption_entries_) {
       delete[] compression_entries_;
       compression_entries_ = NULL;
@@ -4918,7 +4935,7 @@
   const long long stop = start + size;
 
   // Count ContentEncoding elements.
-  int count = 0;
+  long long count = 0;
   while (pos < stop) {
     long long id, size;
     const long status = ParseElementHeader(pReader, pos, stop, id, size);
@@ -4926,8 +4943,11 @@
       return status;
 
     // pos now designates start of element
-    if (id == libwebm::kMkvContentEncoding)
+    if (id == libwebm::kMkvContentEncoding) {
       ++count;
+      if (count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
 
     pos += size;  // consume payload
     if (pos > stop)
@@ -4937,7 +4957,8 @@
   if (count <= 0)
     return -1;
 
-  content_encoding_entries_ = new (std::nothrow) ContentEncoding*[count];
+  content_encoding_entries_ =
+      new (std::nothrow) ContentEncoding*[static_cast<size_t>(count)];
   if (!content_encoding_entries_)
     return -1;
 
@@ -5229,6 +5250,8 @@
 
       projection_ptr->type = static_cast<ProjectionType>(projection_type);
     } else if (child_id == libwebm::kMkvProjectionPrivate) {
+      if (projection_ptr->private_data != NULL)
+        return false;
       unsigned char* data = SafeArrayAlloc<unsigned char>(1, child_size);
 
       if (data == NULL)
@@ -5286,6 +5309,7 @@
       m_projection(NULL) {}
 
 VideoTrack::~VideoTrack() {
+  delete[] m_colour_space;
   delete m_colour;
   delete m_projection;
 }
@@ -5307,7 +5331,7 @@
   long long stereo_mode = 0;
 
   double rate = 0.0;
-  char* colour_space = NULL;
+  std::unique_ptr<char[]> colour_space_ptr;
 
   IMkvReader* const pReader = pSegment->m_pReader;
 
@@ -5384,9 +5408,11 @@
         projection_ptr.reset(projection);
       }
     } else if (id == libwebm::kMkvColourSpace) {
+      char* colour_space = NULL;
       const long status = UnserializeString(pReader, pos, size, colour_space);
       if (status < 0)
         return status;
+      colour_space_ptr.reset(colour_space);
     }
 
     pos += size;  // consume payload
@@ -5418,7 +5444,7 @@
   pTrack->m_stereo_mode = stereo_mode;
   pTrack->m_rate = rate;
   pTrack->m_colour = colour_ptr.release();
-  pTrack->m_colour_space = colour_space;
+  pTrack->m_colour_space = colour_space_ptr.release();
   pTrack->m_projection = projection_ptr.release();
 
   pResult = pTrack;
@@ -5648,7 +5674,7 @@
   const long long stop = m_start + m_size;
   IMkvReader* const pReader = m_pSegment->m_pReader;
 
-  int count = 0;
+  long long count = 0;
   long long pos = m_start;
 
   while (pos < stop) {
@@ -5662,8 +5688,11 @@
     if (size == 0)  // weird
       continue;
 
-    if (id == libwebm::kMkvTrackEntry)
+    if (id == libwebm::kMkvTrackEntry) {
       ++count;
+      if (count > INT_MAX)
+        return E_PARSE_FAILED;
+    }
 
     pos += size;  // consume payload
     if (pos > stop)
@@ -5676,7 +5705,7 @@
   if (count <= 0)
     return 0;  // success
 
-  m_trackEntries = new (std::nothrow) Track*[count];
+  m_trackEntries = new (std::nothrow) Track*[static_cast<size_t>(count)];
 
   if (m_trackEntries == NULL)
     return -1;
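The recurring hardening pattern through this mkvparser.cc update: element counts are accumulated in a long long, rejected with E_PARSE_FAILED once they exceed INT_MAX (or UINT_MAX), and converted with an explicit cast at the allocation site. Condensed into a sketch with illustrative names:

```cpp
// Condensed sketch of the counting pattern (names are illustrative): widen
// the counter, bound it before it can overflow the int-sized bookkeeping
// fields, and make the size_t conversion explicit for new[].
#include <climits>
#include <new>

struct Track {};

Track** AllocTracks(long long count) {
  if (count <= 0 || count > INT_MAX)
    return nullptr;  // mirrors the E_PARSE_FAILED early exit
  return new (std::nothrow) Track*[static_cast<size_t>(count)];
}
```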
diff --git a/libvpx/tools/cpplint.py b/libvpx/tools/cpplint.py
index 25fbef7..e3ebde2 100755
--- a/libvpx/tools/cpplint.py
+++ b/libvpx/tools/cpplint.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 #
 # Copyright (c) 2009 Google Inc. All rights reserved.
 #
@@ -51,16 +51,23 @@
 import string
 import sys
 import unicodedata
+import sysconfig
+
+try:
+  xrange          # Python 2
+except NameError:
+  xrange = range  # Python 3
 
 
 _USAGE = """
 Syntax: cpplint.py [--verbose=#] [--output=vs7] [--filter=-x,+y,...]
                    [--counting=total|toplevel|detailed] [--root=subdir]
-                   [--linelength=digits]
+                   [--linelength=digits] [--headers=x,y,...]
+                   [--quiet]
         <file> [file] ...
 
   The style guidelines this tries to follow are those in
-    http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml
+    https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml
 
   Every problem is given a confidence score from 1-5, with 5 meaning we are
   certain of the problem, and 1 meaning it could be a legitimate construct.
@@ -83,6 +90,9 @@
     verbose=#
       Specify a number 0-5 to restrict errors to certain verbosity levels.
 
+    quiet
+      Don't print anything if no errors are found.
+
     filter=-x,+y,...
       Specify a comma-separated list of category-filters to apply: only
       error messages whose category names pass the filters will be printed.
@@ -114,12 +124,13 @@
       ignored.
 
       Examples:
-        Assuing that src/.git exists, the header guard CPP variables for
-        src/chrome/browser/ui/browser.h are:
+        Assuming that top/src/.git exists (and cwd=top/src), the header guard
+        CPP variables for top/src/chrome/browser/ui/browser.h are:
 
         No flag => CHROME_BROWSER_UI_BROWSER_H_
         --root=chrome => BROWSER_UI_BROWSER_H_
         --root=chrome/browser => UI_BROWSER_H_
+        --root=.. => SRC_CHROME_BROWSER_UI_BROWSER_H_
 
     linelength=digits
       This is the allowed line length for the project. The default value is
@@ -133,6 +144,57 @@
 
       Examples:
         --extensions=hpp,cpp
+
+    headers=x,y,...
+      The header extensions that cpplint will treat as .h in checks. Values are
+      automatically added to the --extensions list.
+
+      Examples:
+        --headers=hpp,hxx
+        --headers=hpp
+
+    cpplint.py supports per-directory configurations specified in CPPLINT.cfg
+    files. CPPLINT.cfg file can contain a number of key=value pairs.
+    Currently the following options are supported:
+
+      set noparent
+      filter=+filter1,-filter2,...
+      exclude_files=regex
+      linelength=80
+      root=subdir
+      headers=x,y,...
+
+    The "set noparent" option prevents cpplint from traversing the directory
+    tree upwards looking for more .cfg files in parent directories. This
+    option is usually placed in the top-level project directory.
+
+    The "filter" option is similar in function to the --filter flag. It
+    specifies message filters in addition to the |_DEFAULT_FILTERS| and those
+    specified through the --filter command-line flag.
+
+    "exclude_files" allows one to specify a regular expression to be matched
+    against a file name. If the expression matches, the file is skipped and
+    not run through the linter.
+
+    "linelength" allows one to specify the allowed line length for the
+    project.
+
+    The "root" option is similar in function to the --root flag (see example
+    above). Paths are relative to the directory of the CPPLINT.cfg.
+
+    The "headers" option is similar in function to the --headers flag
+    (see example above).
+
+    CPPLINT.cfg has an effect on files in the same directory and all
+    sub-directories, unless overridden by a nested configuration file.
+
+      Example file:
+        filter=-build/include_order,+build/include_alpha
+        exclude_files=.*\.cc
+
+    The above example disables build/include_order warning and enables
+    build/include_alpha as well as excludes all .cc from being
+    processed by linter, in the current directory (where the .cfg
+    file is located) and all sub-directories.
 """
 
 # We categorize each error message we print.  Here are the categories.
@@ -140,81 +202,101 @@
 # If you add a new error message with a new category, add it to the list
 # here!  cpplint_unittest.py should tell you if you forget to do this.
 _ERROR_CATEGORIES = [
-  'build/class',
-  'build/deprecated',
-  'build/endif_comment',
-  'build/explicit_make_pair',
-  'build/forward_decl',
-  'build/header_guard',
-  'build/include',
-  'build/include_alpha',
-  'build/include_order',
-  'build/include_what_you_use',
-  'build/namespaces',
-  'build/printf_format',
-  'build/storage_class',
-  'legal/copyright',
-  'readability/alt_tokens',
-  'readability/braces',
-  'readability/casting',
-  'readability/check',
-  'readability/constructors',
-  'readability/fn_size',
-  'readability/function',
-  'readability/multiline_comment',
-  'readability/multiline_string',
-  'readability/namespace',
-  'readability/nolint',
-  'readability/nul',
-  'readability/streams',
-  'readability/todo',
-  'readability/utf8',
-  'runtime/arrays',
-  'runtime/casting',
-  'runtime/explicit',
-  'runtime/int',
-  'runtime/init',
-  'runtime/invalid_increment',
-  'runtime/member_string_references',
-  'runtime/memset',
-  'runtime/operator',
-  'runtime/printf',
-  'runtime/printf_format',
-  'runtime/references',
-  'runtime/sizeof',
-  'runtime/string',
-  'runtime/threadsafe_fn',
-  'runtime/vlog',
-  'whitespace/blank_line',
-  'whitespace/braces',
-  'whitespace/comma',
-  'whitespace/comments',
-  'whitespace/empty_conditional_body',
-  'whitespace/empty_loop_body',
-  'whitespace/end_of_line',
-  'whitespace/ending_newline',
-  'whitespace/forcolon',
-  'whitespace/indent',
-  'whitespace/line_length',
-  'whitespace/newline',
-  'whitespace/operators',
-  'whitespace/parens',
-  'whitespace/semicolon',
-  'whitespace/tab',
-  'whitespace/todo'
-  ]
+    'build/class',
+    'build/c++11',
+    'build/c++14',
+    'build/c++tr1',
+    'build/deprecated',
+    'build/endif_comment',
+    'build/explicit_make_pair',
+    'build/forward_decl',
+    'build/header_guard',
+    'build/include',
+    'build/include_alpha',
+    'build/include_order',
+    'build/include_what_you_use',
+    'build/namespaces',
+    'build/printf_format',
+    'build/storage_class',
+    'legal/copyright',
+    'readability/alt_tokens',
+    'readability/braces',
+    'readability/casting',
+    'readability/check',
+    'readability/constructors',
+    'readability/fn_size',
+    'readability/inheritance',
+    'readability/multiline_comment',
+    'readability/multiline_string',
+    'readability/namespace',
+    'readability/nolint',
+    'readability/nul',
+    'readability/strings',
+    'readability/todo',
+    'readability/utf8',
+    'runtime/arrays',
+    'runtime/casting',
+    'runtime/explicit',
+    'runtime/int',
+    'runtime/init',
+    'runtime/invalid_increment',
+    'runtime/member_string_references',
+    'runtime/memset',
+    'runtime/indentation_namespace',
+    'runtime/operator',
+    'runtime/printf',
+    'runtime/printf_format',
+    'runtime/references',
+    'runtime/string',
+    'runtime/threadsafe_fn',
+    'runtime/vlog',
+    'whitespace/blank_line',
+    'whitespace/braces',
+    'whitespace/comma',
+    'whitespace/comments',
+    'whitespace/empty_conditional_body',
+    'whitespace/empty_if_body',
+    'whitespace/empty_loop_body',
+    'whitespace/end_of_line',
+    'whitespace/ending_newline',
+    'whitespace/forcolon',
+    'whitespace/indent',
+    'whitespace/line_length',
+    'whitespace/newline',
+    'whitespace/operators',
+    'whitespace/parens',
+    'whitespace/semicolon',
+    'whitespace/tab',
+    'whitespace/todo',
+    ]
 
-# The default state of the category filter. This is overrided by the --filter=
+# These error categories are no longer enforced by cpplint, but for backwards-
+# compatibility they may still appear in NOLINT comments.
+_LEGACY_ERROR_CATEGORIES = [
+    'readability/streams',
+    'readability/function',
+    ]
+
+# The default state of the category filter. This is overridden by the --filter=
 # flag. By default all errors are on, so only add here categories that should be
 # off by default (i.e., categories that must be enabled by the --filter= flags).
 # All entries here should start with a '-' or '+', as in the --filter= flag.
 _DEFAULT_FILTERS = ['-build/include_alpha']
 
+# The default list of categories suppressed for C (not C++) files.
+_DEFAULT_C_SUPPRESSED_CATEGORIES = [
+    'readability/casting',
+    ]
+
+# The default list of categories suppressed for Linux Kernel files.
+_DEFAULT_KERNEL_SUPPRESSED_CATEGORIES = [
+    'whitespace/tab',
+    ]
+
 # We used to check for high-bit characters, but after much discussion we
 # decided those were OK, as long as they were in UTF-8 and didn't represent
 # hard-coded international strings, which belong in a separate i18n file.
 
-
 # C++ headers
 _CPP_HEADERS = frozenset([
     # Legacy
@@ -304,6 +386,7 @@
     'random',
     'ratio',
     'regex',
+    'scoped_allocator',
     'set',
     'sstream',
     'stack',
@@ -351,15 +434,40 @@
     'cwctype',
     ])
 
+# Type names
+_TYPES = re.compile(
+    r'^(?:'
+    # [dcl.type.simple]
+    r'(char(16_t|32_t)?)|wchar_t|'
+    r'bool|short|int|long|signed|unsigned|float|double|'
+    # [support.types]
+    r'(ptrdiff_t|size_t|max_align_t|nullptr_t)|'
+    # [cstdint.syn]
+    r'(u?int(_fast|_least)?(8|16|32|64)_t)|'
+    r'(u?int(max|ptr)_t)|'
+    r')$')
+
+
+# These headers are excluded from [build/include] and [build/include_order]
+# checks:
+# - Anything not following google file name conventions (containing an
+#   uppercase character, such as Python.h or nsStringAPI.h, for example).
+# - Lua headers.
+_THIRD_PARTY_HEADERS_PATTERN = re.compile(
+    r'^(?:[^/]*[A-Z][^/]*\.h|lua\.h|lauxlib\.h|lualib\.h)$')
+
+# Pattern for matching FileInfo.BaseName() against test file name
+_TEST_FILE_SUFFIX = r'(_test|_unittest|_regtest)$'
+
+# Pattern that matches only complete whitespace, possibly across multiple lines.
+_EMPTY_CONDITIONAL_BODY_PATTERN = re.compile(r'^\s*$', re.DOTALL)
+
 # Assertion macros.  These are defined in base/logging.h and
-# testing/base/gunit.h.  Note that the _M versions need to come first
-# for substring matching to work.
+# testing/base/public/gunit.h.
 _CHECK_MACROS = [
     'DCHECK', 'CHECK',
-    'EXPECT_TRUE_M', 'EXPECT_TRUE',
-    'ASSERT_TRUE_M', 'ASSERT_TRUE',
-    'EXPECT_FALSE_M', 'EXPECT_FALSE',
-    'ASSERT_FALSE_M', 'ASSERT_FALSE',
+    'EXPECT_TRUE', 'ASSERT_TRUE',
+    'EXPECT_FALSE', 'ASSERT_FALSE',
     ]
 
 # Replacement macros for CHECK/DCHECK/EXPECT_TRUE/EXPECT_FALSE
@@ -372,16 +480,12 @@
   _CHECK_REPLACEMENT['CHECK'][op] = 'CHECK_%s' % replacement
   _CHECK_REPLACEMENT['EXPECT_TRUE'][op] = 'EXPECT_%s' % replacement
   _CHECK_REPLACEMENT['ASSERT_TRUE'][op] = 'ASSERT_%s' % replacement
-  _CHECK_REPLACEMENT['EXPECT_TRUE_M'][op] = 'EXPECT_%s_M' % replacement
-  _CHECK_REPLACEMENT['ASSERT_TRUE_M'][op] = 'ASSERT_%s_M' % replacement
 
 for op, inv_replacement in [('==', 'NE'), ('!=', 'EQ'),
                             ('>=', 'LT'), ('>', 'LE'),
                             ('<=', 'GT'), ('<', 'GE')]:
   _CHECK_REPLACEMENT['EXPECT_FALSE'][op] = 'EXPECT_%s' % inv_replacement
   _CHECK_REPLACEMENT['ASSERT_FALSE'][op] = 'ASSERT_%s' % inv_replacement
-  _CHECK_REPLACEMENT['EXPECT_FALSE_M'][op] = 'EXPECT_%s_M' % inv_replacement
-  _CHECK_REPLACEMENT['ASSERT_FALSE_M'][op] = 'ASSERT_%s_M' % inv_replacement
 
 # Alternative tokens and their replacements.  For full list, see section 2.5
 # Alternative tokens [lex.digraph] in the C++ standard.
@@ -430,12 +534,15 @@
                         r'(?:\s+(volatile|__volatile__))?'
                         r'\s*[{(]')
 
+# Match strings that indicate we're working on a C (not C++) file.
+_SEARCH_C_FILE = re.compile(r'\b(?:LINT_C_FILE|'
+                            r'vim?:\s*.*(\s*|:)filetype=c(\s*|:|$))')
+
+# Match string that indicates we're working on a Linux Kernel file.
+_SEARCH_KERNEL_FILE = re.compile(r'\b(?:LINT_KERNEL_FILE)')
 
 _regexp_compile_cache = {}
 
-# Finds occurrences of NOLINT or NOLINT(...).
-_RE_SUPPRESSION = re.compile(r'\bNOLINT\b(\([^)]*\))?')
-
 # {str, set(int)}: a map from error categories to sets of linenumbers
 # on which those errors are expected and should be suppressed.
 _error_suppressions = {}
@@ -443,6 +550,7 @@
 # The root directory used for deriving header guard CPP variable.
 # This is set by --root flag.
 _root = None
+_root_debug = False
 
 # The allowed line length of files.
 # This is set by --linelength flag.
@@ -452,8 +560,28 @@
 # This is set by --extensions flag.
 _valid_extensions = set(['cc', 'h', 'cpp', 'cu', 'cuh'])
 
+# Treat all headers starting with 'h' equally: .h, .hpp, .hxx etc.
+# This is set by --headers flag.
+_hpp_headers = set(['h'])
+
+# {str, bool}: a map from error categories to booleans which indicate if the
+# category should be suppressed for every line.
+_global_error_suppressions = {}
+
+def ProcessHppHeadersOption(val):
+  global _hpp_headers
+  try:
+    _hpp_headers = set(val.split(','))
+    # Automatically append to the extensions list so it does not have to be set twice
+    _valid_extensions.update(_hpp_headers)
+  except ValueError:
+    PrintUsage('Header extensions must be a comma-separated list.')
+
+def IsHeaderExtension(file_extension):
+  return file_extension in _hpp_headers
+
 def ParseNolintSuppressions(filename, raw_line, linenum, error):
-  """Updates the global list of error-suppressions.
+  """Updates the global list of line error-suppressions.
 
   Parses any NOLINT comments on the current line, updating the global
   error_suppressions store.  Reports an error if the NOLINT comment
@@ -465,42 +593,67 @@
     linenum: int, the number of the current line.
     error: function, an error handler.
   """
-  # FIXME(adonovan): "NOLINT(" is misparsed as NOLINT(*).
-  matched = _RE_SUPPRESSION.search(raw_line)
+  matched = Search(r'\bNOLINT(NEXTLINE)?\b(\([^)]+\))?', raw_line)
   if matched:
-    category = matched.group(1)
+    if matched.group(1):
+      suppressed_line = linenum + 1
+    else:
+      suppressed_line = linenum
+    category = matched.group(2)
     if category in (None, '(*)'):  # => "suppress all"
-      _error_suppressions.setdefault(None, set()).add(linenum)
+      _error_suppressions.setdefault(None, set()).add(suppressed_line)
     else:
       if category.startswith('(') and category.endswith(')'):
         category = category[1:-1]
         if category in _ERROR_CATEGORIES:
-          _error_suppressions.setdefault(category, set()).add(linenum)
-        else:
+          _error_suppressions.setdefault(category, set()).add(suppressed_line)
+        elif category not in _LEGACY_ERROR_CATEGORIES:
           error(filename, linenum, 'readability/nolint', 5,
                 'Unknown NOLINT error category: %s' % category)
 
 
+def ProcessGlobalSuppresions(lines):
+  """Updates the list of global error suppressions.
+
+  Parses any lint directives in the file that have global effect.
+
+  Args:
+    lines: An array of strings, each representing a line of the file, with the
+           last element being empty if the file is terminated with a newline.
+  """
+  for line in lines:
+    if _SEARCH_C_FILE.search(line):
+      for category in _DEFAULT_C_SUPPRESSED_CATEGORIES:
+        _global_error_suppressions[category] = True
+    if _SEARCH_KERNEL_FILE.search(line):
+      for category in _DEFAULT_KERNEL_SUPPRESSED_CATEGORIES:
+        _global_error_suppressions[category] = True
+
+
 def ResetNolintSuppressions():
-  "Resets the set of NOLINT suppressions to empty."
+  """Resets the set of NOLINT suppressions to empty."""
   _error_suppressions.clear()
+  _global_error_suppressions.clear()
 
 
 def IsErrorSuppressedByNolint(category, linenum):
   """Returns true if the specified error category is suppressed on this line.
 
   Consults the global error_suppressions map populated by
-  ParseNolintSuppressions/ResetNolintSuppressions.
+  ParseNolintSuppressions/ProcessGlobalSuppresions/ResetNolintSuppressions.
 
   Args:
     category: str, the category of the error.
     linenum: int, the current line number.
   Returns:
-    bool, True iff the error should be suppressed due to a NOLINT comment.
+    bool, True iff the error should be suppressed due to a NOLINT comment or
+    global suppression.
   """
-  return (linenum in _error_suppressions.get(category, set()) or
+  return (_global_error_suppressions.get(category, False) or
+          linenum in _error_suppressions.get(category, set()) or
           linenum in _error_suppressions.get(None, set()))
 
+
 def Match(pattern, s):
   """Matches the string with the pattern, caching the compiled regexp."""
   # The regexp compilation caching is inlined in both Match and Search for
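From the C++ side being linted, the suppression forms the rewritten parser above understands look like this (the categories are real cpplint categories; the variables are hypothetical):

```cpp
// NOLINTNEXTLINE(category) suppresses the check on the *following* line:
// NOLINTNEXTLINE(runtime/int)
long long raw_timestamp = 0;

short port = 8080;      // NOLINT(runtime/int)  -- same-line form still works
int anything_goes = 0;  // NOLINT  -- bare NOLINT suppresses all categories

// A file-wide marker anywhere in a file enables the C-mode suppressions
// added above (e.g. readability/casting):
// LINT_C_FILE
```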
@@ -536,11 +689,17 @@
   return _regexp_compile_cache[pattern].search(s)
 
 
-class _IncludeState(dict):
+def _IsSourceExtension(s):
+  """File extension (excluding dot) matches a source file extension."""
+  return s in ('c', 'cc', 'cpp', 'cxx')
+
+
+class _IncludeState(object):
   """Tracks line numbers for includes, and the order in which includes appear.
 
-  As a dict, an _IncludeState object serves as a mapping between include
-  filename and line number on which that file was included.
+  include_list contains a list of lists of (header, line number) pairs.
+  It's a list of lists rather than just one flat list to make it
+  easier to update across preprocessor boundaries.
 
   Call CheckNextIncludeOrder() once for each header in the file, passing
   in the type constants defined above. Calls in an illegal order will
@@ -571,15 +730,42 @@
       }
 
   def __init__(self):
-    dict.__init__(self)
-    self.ResetSection()
+    self.include_list = [[]]
+    self.ResetSection('')
 
-  def ResetSection(self):
+  def FindHeader(self, header):
+    """Check if a header has already been included.
+
+    Args:
+      header: header to check.
+    Returns:
+      Line number of previous occurrence, or -1 if the header has not
+      been seen before.
+    """
+    for section_list in self.include_list:
+      for f in section_list:
+        if f[0] == header:
+          return f[1]
+    return -1
+
+  def ResetSection(self, directive):
+    """Reset section checking for preprocessor directive.
+
+    Args:
+      directive: preprocessor directive (e.g. "if", "else").
+    """
     # The name of the current section.
     self._section = self._INITIAL_SECTION
     # The path of last found header.
     self._last_header = ''
 
+    # Update list of includes.  Note that we never pop from the
+    # include list.
+    if directive in ('if', 'ifdef', 'ifndef'):
+      self.include_list.append([])
+    elif directive in ('else', 'elif'):
+      self.include_list[-1] = []
+
   def SetLastHeader(self, header_path):
     self._last_header = header_path
 
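The new per-branch include sections exist because each preprocessor arm is an independent include sequence, so ordering checks across arms would be meaningless. For example:

```cpp
// Each branch gets its own section in include_list, so the order checker
// never compares <windows.h> against <unistd.h> alphabetically.
#ifdef _WIN32
#include <windows.h>   // #ifdef opens a fresh section
#else
#include <unistd.h>    // #else resets the current section
#endif
```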
@@ -615,7 +801,7 @@
     # If previous line was a blank line, assume that the headers are
     # intentionally sorted the way they are.
     if (self._last_header > header_path and
-        not Match(r'^\s*$', clean_lines.elided[linenum - 1])):
+        Match(r'^\s*#\s*include\b', clean_lines.elided[linenum - 1])):
       return False
     return True
 
@@ -681,8 +867,11 @@
     self.error_count = 0    # global count of reported errors
     # filters to apply when emitting error messages
     self.filters = _DEFAULT_FILTERS[:]
+    # backup of filter list. Used to restore the state after each file.
+    self._filters_backup = self.filters[:]
     self.counting = 'total'  # In what way are we counting errors?
     self.errors_by_category = {}  # string to int dict storing error counts
+    self.quiet = False  # Suppress non-error messages?
 
     # output format:
     # "emacs" - format that emacs can parse (default)
@@ -693,6 +882,12 @@
     """Sets the output format for errors."""
     self.output_format = output_format
 
+  def SetQuiet(self, quiet):
+    """Sets the module's quiet setting, and returns the previous setting."""
+    last_quiet = self.quiet
+    self.quiet = quiet
+    return last_quiet
+
   def SetVerboseLevel(self, level):
     """Sets the module's verbosity, and returns the previous setting."""
     last_verbose_level = self.verbose_level
@@ -719,6 +914,10 @@
     """
     # Default filters always have less priority than the flag ones.
     self.filters = _DEFAULT_FILTERS[:]
+    self.AddFilters(filters)
+
+  def AddFilters(self, filters):
+    """ Adds more filters to the existing list of error-message filters. """
     for filt in filters.split(','):
       clean_filt = filt.strip()
       if clean_filt:
@@ -728,6 +927,14 @@
         raise ValueError('Every filter in --filters must start with + or -'
                          ' (%s does not)' % filt)
 
+  def BackupFilters(self):
+    """ Saves the current filter list to backup storage."""
+    self._filters_backup = self.filters[:]
+
+  def RestoreFilters(self):
+    """ Restores filters previously backed up."""
+    self.filters = self._filters_backup[:]
+
   def ResetErrorCounts(self):
     """Sets the module's error statistic back to zero."""
     self.error_count = 0
@@ -748,7 +955,7 @@
     for category, count in self.errors_by_category.iteritems():
       sys.stderr.write('Category \'%s\' errors found: %d\n' %
                        (category, count))
-    sys.stderr.write('Total errors found: %d\n' % self.error_count)
+    sys.stdout.write('Total errors found: %d\n' % self.error_count)
 
 _cpplint_state = _CppLintState()
 
@@ -762,6 +969,14 @@
   """Sets the module's output format."""
   _cpplint_state.SetOutputFormat(output_format)
 
+def _Quiet():
+  """Returns the module's quiet setting."""
+  return _cpplint_state.quiet
+
+def _SetQuiet(quiet):
+  """Set the module's quiet status, and return previous setting."""
+  return _cpplint_state.SetQuiet(quiet)
+
 
 def _VerboseLevel():
   """Returns the module's verbosity setting."""
@@ -795,6 +1010,25 @@
   """
   _cpplint_state.SetFilters(filters)
 
+def _AddFilters(filters):
+  """Adds more filter overrides.
+
+  Unlike _SetFilters, this function does not reset the current list of filters
+  available.
+
+  Args:
+    filters: A string of comma-separated filters (eg "whitespace/indent").
+             Each filter should start with + or -; else we die.
+  """
+  _cpplint_state.AddFilters(filters)
+
+def _BackupFilters():
+  """ Saves the current filter list to backup storage."""
+  _cpplint_state.BackupFilters()
+
+def _RestoreFilters():
+  """ Restores filters previously backed up."""
+  _cpplint_state.RestoreFilters()
 
 class _FunctionState(object):
   """Tracks current function name and the number of lines in its body."""
@@ -830,6 +1064,9 @@
       filename: The name of the current file.
       linenum: The number of the line to check.
     """
+    if not self.in_a_function:
+      return
+
     if Match(r'T(EST|est)', self.current_function):
       base_trigger = self._TEST_TRIGGER
     else:
@@ -857,7 +1094,7 @@
   pass
 
 
-class FileInfo:
+class FileInfo(object):
   """Provides utility functions for filenames.
 
   FileInfo provides easy access to the components of a file's path
@@ -900,12 +1137,13 @@
 
       # Not SVN <= 1.6? Try to find a git, hg, or svn top level directory by
       # searching up from the current path.
-      root_dir = os.path.dirname(fullname)
-      while (root_dir != os.path.dirname(root_dir) and
-             not os.path.exists(os.path.join(root_dir, ".git")) and
-             not os.path.exists(os.path.join(root_dir, ".hg")) and
-             not os.path.exists(os.path.join(root_dir, ".svn"))):
-        root_dir = os.path.dirname(root_dir)
+      root_dir = current_dir = os.path.dirname(fullname)
+      while current_dir != os.path.dirname(current_dir):
+        if (os.path.exists(os.path.join(current_dir, ".git")) or
+            os.path.exists(os.path.join(current_dir, ".hg")) or
+            os.path.exists(os.path.join(current_dir, ".svn"))):
+          root_dir = current_dir
+        current_dir = os.path.dirname(current_dir)
 
       if (os.path.exists(os.path.join(root_dir, ".git")) or
           os.path.exists(os.path.join(root_dir, ".hg")) or
@@ -944,7 +1182,7 @@
 
   def IsSource(self):
     """File has a source file extension."""
-    return self.Extension()[1:] in ('c', 'cc', 'cpp', 'cxx')
+    return _IsSourceExtension(self.Extension()[1:])
 
 
 def _ShouldPrintError(category, confidence, linenum):
@@ -955,6 +1193,7 @@
   # the verbosity level isn't high enough, or the filters filter it out.
   if IsErrorSuppressedByNolint(category, linenum):
     return False
+
   if confidence < _cpplint_state.verbose_level:
     return False
 
@@ -999,8 +1238,8 @@
   if _ShouldPrintError(category, confidence, linenum):
     _cpplint_state.IncrementErrorCount(category)
     if _cpplint_state.output_format == 'vs7':
-      sys.stderr.write('%s(%s):  %s  [%s] [%d]\n' % (
-          filename, linenum, message, category, confidence))
+      sys.stderr.write('%s(%s): error cpplint: [%s] %s [%d]\n' % (
+          filename, linenum, category, message, confidence))
     elif _cpplint_state.output_format == 'eclipse':
       sys.stderr.write('%s:%s: warning: %s  [%s] [%d]\n' % (
           filename, linenum, message, category, confidence))
@@ -1012,11 +1251,9 @@
 # Matches standard C++ escape sequences per 2.13.2.3 of the C++ standard.
 _RE_PATTERN_CLEANSE_LINE_ESCAPES = re.compile(
     r'\\([abfnrtv?"\\\']|\d+|x[0-9a-fA-F]+)')
-# Matches strings.  Escape codes should already be removed by ESCAPES.
-_RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES = re.compile(r'"[^"]*"')
-# Matches characters.  Escape codes should already be removed by ESCAPES.
-_RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES = re.compile(r"'.'")
-# Matches multi-line C++ comments.
+# Match a single C style comment on the same line.
+_RE_PATTERN_C_COMMENTS = r'/\*(?:[^*]|\*(?!/))*\*/'
+# Matches multi-line C style comments.
 # This RE is a little bit more complicated than one might expect, because we
 # have to take care of space removals tools so we can handle comments inside
 # statements better.
@@ -1025,10 +1262,10 @@
 # if this doesn't work we try on left side but only if there's a non-character
 # on the right.
 _RE_PATTERN_CLEANSE_LINE_C_COMMENTS = re.compile(
-    r"""(\s*/\*.*\*/\s*$|
-            /\*.*\*/\s+|
-         \s+/\*.*\*/(?=\W)|
-            /\*.*\*/)""", re.VERBOSE)
+    r'(\s*' + _RE_PATTERN_C_COMMENTS + r'\s*$|' +
+    _RE_PATTERN_C_COMMENTS + r'\s+|' +
+    r'\s+' + _RE_PATTERN_C_COMMENTS + r'(?=\W)|' +
+    _RE_PATTERN_C_COMMENTS + r')')
 
 
 def IsCppString(line):
@@ -1083,13 +1320,26 @@
         delimiter = None
       else:
         # Haven't found the end yet, append a blank line.
-        line = ''
+        line = '""'
 
-    else:
+    # Look for beginning of a raw string, and replace them with
+    # empty strings.  This is done in a loop to handle multiple raw
+    # strings on the same line.
+    while delimiter is None:
       # Look for beginning of a raw string.
       # See 2.14.15 [lex.string] for syntax.
-      matched = Match(r'^(.*)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line)
-      if matched:
+      #
+      # Once we have matched a raw string, we check the prefix of the
+      # line to make sure that the line is not part of a single line
+      # comment.  It's done this way because we remove raw strings
+      # before removing comments as opposed to removing comments
+      # before removing raw strings.  This is because there are some
+      # cpplint checks that requires the comments to be preserved, but
+      # we don't want to check comments that are inside raw strings.
+      matched = Match(r'^(.*?)\b(?:R|u8R|uR|UR|LR)"([^\s\\()]*)\((.*)$', line)
+      if (matched and
+          not Match(r'^([^\'"]|\'(\\.|[^\'])*\'|"(\\.|[^"])*")*//',
+                    matched.group(1))):
         delimiter = ')' + matched.group(2) + '"'
 
         end = matched.group(3).find(delimiter)
@@ -1101,6 +1351,8 @@
         else:
           # Start of a multi-line raw string
           line = matched.group(1) + '""'
+      else:
+        break
 
     lines_without_raw_strings.append(line)
 
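The reason raw strings are collapsed before comment stripping, as the new comment block explains, is that their bodies can contain character sequences that would otherwise be misread as comments or string delimiters. For instance:

```cpp
// Both constants hold plain string data, but a comment-stripper running
// first would mangle them; collapsing raw strings to "" sidesteps that.
#include <string>

const std::string kNotAComment = R"(// this is data, not a comment)";
const std::string kNestedQuote = R"sep(a ")" inside, thanks to the delimiter)sep";
```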
@@ -1131,10 +1383,10 @@
 
 def RemoveMultiLineCommentsFromRange(lines, begin, end):
   """Clears a range of lines for multi-line comments."""
-  # Having // dummy comments makes the lines non-empty, so we will not get
+  # Having // <empty> comments makes the lines non-empty, so we will not get
   # unnecessary blank line warnings later in the code.
   for i in range(begin, end):
-    lines[i] = '// dummy'
+    lines[i] = '/**/'
 
 
 def RemoveMultiLineComments(filename, lines, error):
@@ -1170,12 +1422,14 @@
 
 
 class CleansedLines(object):
-  """Holds 3 copies of all lines with different preprocessing applied to them.
+  """Holds 4 copies of all lines with different preprocessing applied to them.
 
-  1) elided member contains lines without strings and comments,
-  2) lines member contains lines without comments, and
+  1) elided member contains lines without strings and comments.
+  2) lines member contains lines without comments.
   3) raw_lines member contains all the lines without processing.
-  All these three members are of <type 'list'>, and of the same length.
+  4) lines_without_raw_strings member is the same as raw_lines, but with C++11
+     raw strings removed.
+  All these members are of <type 'list'>, and of the same length.
   """
 
   def __init__(self, lines):
@@ -1206,38 +1460,138 @@
     Returns:
       The line with collapsed strings.
     """
-    if not _RE_PATTERN_INCLUDE.match(elided):
-      # Remove escaped characters first to make quote/single quote collapsing
-      # basic.  Things that look like escaped characters shouldn't occur
-      # outside of strings and chars.
-      elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided)
-      elided = _RE_PATTERN_CLEANSE_LINE_SINGLE_QUOTES.sub("''", elided)
-      elided = _RE_PATTERN_CLEANSE_LINE_DOUBLE_QUOTES.sub('""', elided)
-    return elided
+    if _RE_PATTERN_INCLUDE.match(elided):
+      return elided
+
+    # Remove escaped characters first to make quote/single quote collapsing
+    # basic.  Things that look like escaped characters shouldn't occur
+    # outside of strings and chars.
+    elided = _RE_PATTERN_CLEANSE_LINE_ESCAPES.sub('', elided)
+
+    # Replace quoted strings and digit separators.  Both single quotes
+    # and double quotes are processed in the same loop, otherwise
+    # nested quotes wouldn't work.
+    collapsed = ''
+    while True:
+      # Find the first quote character
+      match = Match(r'^([^\'"]*)([\'"])(.*)$', elided)
+      if not match:
+        collapsed += elided
+        break
+      head, quote, tail = match.groups()
+
+      if quote == '"':
+        # Collapse double quoted strings
+        second_quote = tail.find('"')
+        if second_quote >= 0:
+          collapsed += head + '""'
+          elided = tail[second_quote + 1:]
+        else:
+          # Unmatched double quote, don't bother processing the rest
+          # of the line since this is probably a multiline string.
+          collapsed += elided
+          break
+      else:
+        # Found single quote, check nearby text to eliminate digit separators.
+        #
+        # There is no special handling for floating point here, because
+        # the integer/fractional/exponent parts would all be parsed
+        # correctly as long as there are digits on both sides of the
+        # separator.  So we are fine as long as we don't see something
+        # like "0.'3" (gcc 4.9.0 will not allow this literal).
+        if Search(r'\b(?:0[bBxX]?|[1-9])[0-9a-fA-F]*$', head):
+          match_literal = Match(r'^((?:\'?[0-9a-zA-Z_])*)(.*)$', "'" + tail)
+          collapsed += head + match_literal.group(1).replace("'", '')
+          elided = match_literal.group(2)
+        else:
+          second_quote = tail.find('\'')
+          if second_quote >= 0:
+            collapsed += head + "''"
+            elided = tail[second_quote + 1:]
+          else:
+            # Unmatched single quote
+            collapsed += elided
+            break
+
+    return collapsed
 
 
-def FindEndOfExpressionInLine(line, startpos, depth, startchar, endchar):
-  """Find the position just after the matching endchar.
+def FindEndOfExpressionInLine(line, startpos, stack):
+  """Find the position just after the end of current parenthesized expression.
 
   Args:
     line: a CleansedLines line.
     startpos: start searching at this position.
-    depth: nesting level at startpos.
-    startchar: expression opening character.
-    endchar: expression closing character.
+    stack: nesting stack at startpos.
 
   Returns:
-    On finding matching endchar: (index just after matching endchar, 0)
-    Otherwise: (-1, new depth at end of this line)
+    On finding matching end: (index just after matching end, None)
+    On finding an unclosed expression: (-1, None)
+    Otherwise: (-1, new stack at end of this line)
   """
   for i in xrange(startpos, len(line)):
-    if line[i] == startchar:
-      depth += 1
-    elif line[i] == endchar:
-      depth -= 1
-      if depth == 0:
-        return (i + 1, 0)
-  return (-1, depth)
+    char = line[i]
+    if char in '([{':
+      # Found start of parenthesized expression, push to expression stack
+      stack.append(char)
+    elif char == '<':
+      # Found potential start of template argument list
+      if i > 0 and line[i - 1] == '<':
+        # Left shift operator
+        if stack and stack[-1] == '<':
+          stack.pop()
+          if not stack:
+            return (-1, None)
+      elif i > 0 and Search(r'\boperator\s*$', line[0:i]):
+        # operator<, don't add to stack
+        continue
+      else:
+        # Tentative start of template argument list
+        stack.append('<')
+    elif char in ')]}':
+      # Found end of parenthesized expression.
+      #
+      # If we are currently expecting a matching '>', the pending '<'
+      # must have been an operator.  Remove them from expression stack.
+      while stack and stack[-1] == '<':
+        stack.pop()
+      if not stack:
+        return (-1, None)
+      if ((stack[-1] == '(' and char == ')') or
+          (stack[-1] == '[' and char == ']') or
+          (stack[-1] == '{' and char == '}')):
+        stack.pop()
+        if not stack:
+          return (i + 1, None)
+      else:
+        # Mismatched parentheses
+        return (-1, None)
+    elif char == '>':
+      # Found potential end of template argument list.
+
+      # Ignore "->" and operator functions
+      if (i > 0 and
+          (line[i - 1] == '-' or Search(r'\boperator\s*$', line[0:i - 1]))):
+        continue
+
+      # Pop the stack if there is a matching '<'.  Otherwise, ignore
+      # this '>' since it must be an operator.
+      if stack:
+        if stack[-1] == '<':
+          stack.pop()
+          if not stack:
+            return (i + 1, None)
+    elif char == ';':
+      # Found something that looks like the end of a statement.  If we are
+      # currently expecting a '>', the matching '<' must have been an
+      # operator, since a template argument list should not contain statements.
+      while stack and stack[-1] == '<':
+        stack.pop()
+      if not stack:
+        return (-1, None)
+
+  # Did not find end of expression or unbalanced parentheses on this line
+  return (-1, stack)
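+
+# Illustrative traces (hypothetical inputs, starting with an empty stack):
+#   FindEndOfExpressionInLine('f(a, b)', 1, [])     => (7, None)
+#   FindEndOfExpressionInLine('set<int> x', 3, [])  => (8, None)
+#   FindEndOfExpressionInLine('foo(bar(', 3, [])    => (-1, ['(', '('])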
 
 
 def CloseExpression(clean_lines, linenum, pos):
@@ -1246,6 +1600,11 @@
   If lines[linenum][pos] points to a '(' or '{' or '[' or '<', finds the
   linenum/pos that correspond to the closing of the expression.
 
+  TODO(unknown): cpplint spends a fair bit of time matching parentheses.
+  Ideally we would want to index all opening and closing parentheses once
+  and have CloseExpression be just a simple lookup, but due to preprocessor
+  tricks, this is not so easy.
+
   Args:
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
@@ -1259,35 +1618,28 @@
   """
 
   line = clean_lines.elided[linenum]
-  startchar = line[pos]
-  if startchar not in '({[<':
+  if (line[pos] not in '({[<') or Match(r'<[<=]', line[pos:]):
     return (line, clean_lines.NumLines(), -1)
-  if startchar == '(': endchar = ')'
-  if startchar == '[': endchar = ']'
-  if startchar == '{': endchar = '}'
-  if startchar == '<': endchar = '>'
 
   # Check first line
-  (end_pos, num_open) = FindEndOfExpressionInLine(
-      line, pos, 0, startchar, endchar)
+  (end_pos, stack) = FindEndOfExpressionInLine(line, pos, [])
   if end_pos > -1:
     return (line, linenum, end_pos)
 
   # Continue scanning forward
-  while linenum < clean_lines.NumLines() - 1:
+  while stack and linenum < clean_lines.NumLines() - 1:
     linenum += 1
     line = clean_lines.elided[linenum]
-    (end_pos, num_open) = FindEndOfExpressionInLine(
-        line, 0, num_open, startchar, endchar)
+    (end_pos, stack) = FindEndOfExpressionInLine(line, 0, stack)
     if end_pos > -1:
       return (line, linenum, end_pos)
 
-  # Did not find endchar before end of file, give up
+  # Did not find end of expression before end of file, give up
   return (line, clean_lines.NumLines(), -1)
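+
+# Example (hypothetical input): if clean_lines.elided[linenum] is
+#   if (foo(bar) > 0) {
+# then CloseExpression(clean_lines, linenum, 3) returns (line, linenum, 17),
+# the index just past the ')' matching the '(' at index 3.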
 
 
-def FindStartOfExpressionInLine(line, endpos, depth, startchar, endchar):
-  """Find position at the matching startchar.
+def FindStartOfExpressionInLine(line, endpos, stack):
+  """Find position at the matching start of current expression.
 
   This is almost the reverse of FindEndOfExpressionInLine, but note
   that the input position and returned position differs by 1.
@@ -1295,22 +1647,72 @@
   Args:
     line: a CleansedLines line.
     endpos: start searching at this position.
-    depth: nesting level at endpos.
-    startchar: expression opening character.
-    endchar: expression closing character.
+    stack: nesting stack at endpos.
 
   Returns:
-    On finding matching startchar: (index at matching startchar, 0)
-    Otherwise: (-1, new depth at beginning of this line)
+    On finding matching start: (index at matching start, None)
+    On finding an unclosed expression: (-1, None)
+    Otherwise: (-1, new stack at beginning of this line)
   """
-  for i in xrange(endpos, -1, -1):
-    if line[i] == endchar:
-      depth += 1
-    elif line[i] == startchar:
-      depth -= 1
-      if depth == 0:
-        return (i, 0)
-  return (-1, depth)
+  i = endpos
+  while i >= 0:
+    char = line[i]
+    if char in ')]}':
+      # Found end of expression, push to expression stack
+      stack.append(char)
+    elif char == '>':
+      # Found potential end of template argument list.
+      #
+      # Ignore it if it's a "->" or ">=" or "operator>"
+      if (i > 0 and
+          (line[i - 1] == '-' or
+           Match(r'\s>=\s', line[i - 1:]) or
+           Search(r'\boperator\s*$', line[0:i]))):
+        i -= 1
+      else:
+        stack.append('>')
+    elif char == '<':
+      # Found potential start of template argument list
+      if i > 0 and line[i - 1] == '<':
+        # Left shift operator
+        i -= 1
+      else:
+        # If there is a matching '>', we can pop the expression stack.
+        # Otherwise, ignore this '<' since it must be an operator.
+        if stack and stack[-1] == '>':
+          stack.pop()
+          if not stack:
+            return (i, None)
+    elif char in '([{':
+      # Found start of expression.
+      #
+      # If there are any unmatched '>' on the stack, they must be
+      # operators.  Remove those.
+      while stack and stack[-1] == '>':
+        stack.pop()
+      if not stack:
+        return (-1, None)
+      if ((char == '(' and stack[-1] == ')') or
+          (char == '[' and stack[-1] == ']') or
+          (char == '{' and stack[-1] == '}')):
+        stack.pop()
+        if not stack:
+          return (i, None)
+      else:
+        # Mismatched parentheses
+        return (-1, None)
+    elif char == ';':
+      # Found something that looks like the end of a statement.  If we are
+      # currently expecting a '<', the matching '>' must have been an
+      # operator, since a template argument list should not contain statements.
+      while stack and stack[-1] == '>':
+        stack.pop()
+      if not stack:
+        return (-1, None)
+
+    i -= 1
+
+  return (-1, stack)
 
 
 def ReverseCloseExpression(clean_lines, linenum, pos):
@@ -1331,30 +1733,23 @@
     return is the 'cleansed' line at linenum.
   """
   line = clean_lines.elided[linenum]
-  endchar = line[pos]
-  if endchar not in ')}]>':
+  if line[pos] not in ')}]>':
     return (line, 0, -1)
-  if endchar == ')': startchar = '('
-  if endchar == ']': startchar = '['
-  if endchar == '}': startchar = '{'
-  if endchar == '>': startchar = '<'
 
   # Check last line
-  (start_pos, num_open) = FindStartOfExpressionInLine(
-      line, pos, 0, startchar, endchar)
+  (start_pos, stack) = FindStartOfExpressionInLine(line, pos, [])
   if start_pos > -1:
     return (line, linenum, start_pos)
 
   # Continue scanning backward
-  while linenum > 0:
+  while stack and linenum > 0:
     linenum -= 1
     line = clean_lines.elided[linenum]
-    (start_pos, num_open) = FindStartOfExpressionInLine(
-        line, len(line) - 1, num_open, startchar, endchar)
+    (start_pos, stack) = FindStartOfExpressionInLine(line, len(line) - 1, stack)
     if start_pos > -1:
       return (line, linenum, start_pos)
 
-  # Did not find startchar before beginning of file, give up
+  # Did not find start of expression before beginning of file, give up
   return (line, 0, -1)
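+
+# Example (hypothetical input): if clean_lines.elided[linenum] is
+#   typedef set<int> IntSet;
+# then ReverseCloseExpression(clean_lines, linenum, 15) returns
+# (line, linenum, 11), the index of the '<' matching the '>' at index 15.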
 
 
@@ -1362,7 +1757,7 @@
   """Logs an error if no Copyright message appears at the top of the file."""
 
   # We'll say it should occur by line 10. Don't forget there's a
-  # dummy line at the front.
+  # placeholder line at the front.
   for line in xrange(1, min(len(lines), 11)):
     if re.search(r'Copyright', lines[line], re.I): break
   else:                       # means no copyright line was found
@@ -1371,6 +1766,46 @@
           'You should have a line: "Copyright [year] <Copyright Owner>"')
 
 
+def GetIndentLevel(line):
+  """Return the number of leading spaces in line.
+
+  Args:
+    line: A string to check.
+
+  Returns:
+    An integer count of leading spaces, possibly zero.
+  """
+  indent = Match(r'^( *)\S', line)
+  if indent:
+    return len(indent.group(1))
+  else:
+    return 0
+
+
+def PathSplitToList(path):
+  """Returns the path split into a list by the separator.
+
+  Args:
+    path: An absolute or relative path (e.g. '/a/b/c/' or '../a')
+
+  Returns:
+    A list of path components (e.g. ['a', 'b', 'c']).
+  """
+  lst = []
+  while True:
+    (head, tail) = os.path.split(path)
+    if head == path: # absolute paths end
+      lst.append(head)
+      break
+    if tail == path: # relative paths end
+      lst.append(tail)
+      break
+
+    path = head
+    lst.append(tail)
+
+  lst.reverse()
+  return lst
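+  # For example (POSIX path semantics assumed):
+  #   PathSplitToList('a/b/c')  ->  ['a', 'b', 'c']
+  #   PathSplitToList('/a/b')   ->  ['/', 'a', 'b']
+  #   PathSplitToList('../a')   ->  ['..', 'a']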
+
+
 def GetHeaderGuardCPPVariable(filename):
   """Returns the CPP variable that should be used as a header guard.
 
@@ -1387,15 +1822,67 @@
   # flymake.
   filename = re.sub(r'_flymake\.h$', '.h', filename)
   filename = re.sub(r'/\.flymake/([^/]*)$', r'/\1', filename)
+  # Replace 'c++' with 'cpp'.
+  filename = filename.replace('C++', 'cpp').replace('c++', 'cpp')
 
   fileinfo = FileInfo(filename)
   file_path_from_root = fileinfo.RepositoryName()
-  if _root:
-    file_path_from_root = re.sub('^' + _root + os.sep, '', file_path_from_root)
-  return re.sub(r'[-./\s]', '_', file_path_from_root).upper() + '_'
+
+  def FixupPathFromRoot():
+    if _root_debug:
+      sys.stderr.write("\n_root fixup, _root = '%s', repository name = '%s'\n"
+          %(_root, fileinfo.RepositoryName()))
+
+    # Process the file path with the --root flag if it was set.
+    if not _root:
+      if _root_debug:
+        sys.stderr.write("_root unspecified\n")
+      return file_path_from_root
+
+    def StripListPrefix(lst, prefix):
+      # f(['x', 'y'], ['w', 'z']) -> None  (not a valid prefix)
+      if lst[:len(prefix)] != prefix:
+        return None
+      # f(['a', 'b', 'c', 'd'], ['a', 'b']) -> ['c', 'd']
+      return lst[(len(prefix)):]
+
+    # root behavior:
+    #   --root=subdir , lstrips subdir from the header guard
+    maybe_path = StripListPrefix(PathSplitToList(file_path_from_root),
+                                 PathSplitToList(_root))
+
+    if _root_debug:
+      sys.stderr.write(("_root lstrip (maybe_path=%s, file_path_from_root=%s," +
+          " _root=%s)\n") %(maybe_path, file_path_from_root, _root))
+
+    if maybe_path:
+      return os.path.join(*maybe_path)
+
+    #   --root=.. , will prepend the outer directory to the header guard
+    full_path = fileinfo.FullName()
+    root_abspath = os.path.abspath(_root)
+
+    maybe_path = StripListPrefix(PathSplitToList(full_path),
+                                 PathSplitToList(root_abspath))
+
+    if _root_debug:
+      sys.stderr.write(("_root prepend (maybe_path=%s, full_path=%s, " +
+          "root_abspath=%s)\n") %(maybe_path, full_path, root_abspath))
+
+    if maybe_path:
+      return os.path.join(*maybe_path)
+
+    if _root_debug:
+      sys.stderr.write("_root ignore, returning %s\n" %(file_path_from_root))
+
+    #   --root=FAKE_DIR is ignored
+    return file_path_from_root
+
+  file_path_from_root = FixupPathFromRoot()
+  return re.sub(r'[^a-zA-Z0-9]', '_', file_path_from_root).upper() + '_'
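+
+# For example (illustrative), with no --root flag, a header at
+# 'foo/src/bar_baz.h' relative to the repository root yields the guard
+# 'FOO_SRC_BAR_BAZ_H_' (every non-alphanumeric character becomes '_').
+# The exact result depends on FileInfo.RepositoryName().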
 
 
-def CheckForHeaderGuard(filename, lines, error):
+def CheckForHeaderGuard(filename, clean_lines, error):
   """Checks that the file contains a header guard.
 
   Logs an error if no #ifndef header guard is present.  For other
@@ -1403,18 +1890,29 @@
 
   Args:
     filename: The name of the C++ header file.
-    lines: An array of strings, each representing a line of the file.
+    clean_lines: A CleansedLines instance containing the file.
     error: The function to call with any errors found.
   """
 
+  # Don't check for header guards if there are error suppression
+  # comments somewhere in this file.
+  #
+  # Because this is silencing a warning for a nonexistent line, we
+  # only support the very specific NOLINT(build/header_guard) syntax,
+  # and not the general NOLINT or NOLINT(*) syntax.
+  raw_lines = clean_lines.lines_without_raw_strings
+  for i in raw_lines:
+    if Search(r'//\s*NOLINT\(build/header_guard\)', i):
+      return
+
   cppvar = GetHeaderGuardCPPVariable(filename)
 
-  ifndef = None
+  ifndef = ''
   ifndef_linenum = 0
-  define = None
-  endif = None
+  define = ''
+  endif = ''
   endif_linenum = 0
-  for linenum, line in enumerate(lines):
+  for linenum, line in enumerate(raw_lines):
     linesplit = line.split()
     if len(linesplit) >= 2:
       # find the first occurrence of #ifndef and #define, save arg
@@ -1429,18 +1927,12 @@
       endif = line
       endif_linenum = linenum
 
-  if not ifndef:
+  if not ifndef or not define or ifndef != define:
     error(filename, 0, 'build/header_guard', 5,
           'No #ifndef header guard found, suggested CPP variable is: %s' %
           cppvar)
     return
 
-  if not define:
-    error(filename, 0, 'build/header_guard', 5,
-          'No #define header guard found, suggested CPP variable is: %s' %
-          cppvar)
-    return
-
   # The guard should be PATH_FILE_H_, but we also allow PATH_FILE_H__
   # for backward compatibility.
   if ifndef != cppvar:
@@ -1448,26 +1940,69 @@
     if ifndef != cppvar + '_':
       error_level = 5
 
-    ParseNolintSuppressions(filename, lines[ifndef_linenum], ifndef_linenum,
+    ParseNolintSuppressions(filename, raw_lines[ifndef_linenum], ifndef_linenum,
                             error)
     error(filename, ifndef_linenum, 'build/header_guard', error_level,
           '#ifndef header guard has wrong style, please use: %s' % cppvar)
 
-  if define != ifndef:
-    error(filename, 0, 'build/header_guard', 5,
-          '#ifndef and #define don\'t match, suggested CPP variable is: %s' %
-          cppvar)
+  # Check for "//" comments on endif line.
+  ParseNolintSuppressions(filename, raw_lines[endif_linenum], endif_linenum,
+                          error)
+  match = Match(r'#endif\s*//\s*' + cppvar + r'(_)?\b', endif)
+  if match:
+    if match.group(1) == '_':
+      # Issue low severity warning for deprecated double trailing underscore
+      error(filename, endif_linenum, 'build/header_guard', 0,
+            '#endif line should be "#endif  // %s"' % cppvar)
     return
 
-  if endif != ('#endif  // %s' % cppvar):
-    error_level = 0
-    if endif != ('#endif  // %s' % (cppvar + '_')):
-      error_level = 5
+  # Didn't find the corresponding "//" comment.  If this file does not
+  # contain any "//" comments at all, it could be that the compiler
+  # only accepts "/**/" comments; look for those instead.
+  no_single_line_comments = True
+  for i in xrange(1, len(raw_lines) - 1):
+    line = raw_lines[i]
+    if Match(r'^(?:(?:\'(?:\.|[^\'])*\')|(?:"(?:\.|[^"])*")|[^\'"])*//', line):
+      no_single_line_comments = False
+      break
 
-    ParseNolintSuppressions(filename, lines[endif_linenum], endif_linenum,
-                            error)
-    error(filename, endif_linenum, 'build/header_guard', error_level,
-          '#endif line should be "#endif  // %s"' % cppvar)
+  if no_single_line_comments:
+    match = Match(r'#endif\s*/\*\s*' + cppvar + r'(_)?\s*\*/', endif)
+    if match:
+      if match.group(1) == '_':
+        # Low severity warning for double trailing underscore
+        error(filename, endif_linenum, 'build/header_guard', 0,
+              '#endif line should be "#endif  /* %s */"' % cppvar)
+      return
+
+  # Didn't find anything
+  error(filename, endif_linenum, 'build/header_guard', 5,
+        '#endif line should be "#endif  // %s"' % cppvar)
+
+
+def CheckHeaderFileIncluded(filename, include_state, error):
+  """Logs an error if a .cc file does not include its header."""
+
+  # Do not check test files
+  fileinfo = FileInfo(filename)
+  if Search(_TEST_FILE_SUFFIX, fileinfo.BaseName()):
+    return
+
+  headerfile = filename[0:len(filename) - len(fileinfo.Extension())] + '.h'
+  if not os.path.exists(headerfile):
+    return
+  headername = FileInfo(headerfile).RepositoryName()
+  first_include = 0
+  for section_list in include_state.include_list:
+    for f in section_list:
+      if headername in f[0] or f[0] in headername:
+        return
+      if not first_include:
+        first_include = f[1]
+
+  error(filename, first_include, 'build/include', 5,
+        '%s should include its header file %s' % (fileinfo.RepositoryName(),
+                                                  headername))
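+
+# For example (illustrative), linting 'foo/bar.cc' when 'foo/bar.h' exists
+# on disk but is never #included would report at the first include:
+#   foo/bar.cc should include its header file foo/bar.h
+# Files matching _TEST_FILE_SUFFIX are skipped.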
 
 
 def CheckForBadCharacters(filename, lines, error):
@@ -1551,19 +2086,33 @@
           'Use C++11 raw strings or concatenation instead.')
 
 
-threading_list = (
-    ('asctime(', 'asctime_r('),
-    ('ctime(', 'ctime_r('),
-    ('getgrgid(', 'getgrgid_r('),
-    ('getgrnam(', 'getgrnam_r('),
-    ('getlogin(', 'getlogin_r('),
-    ('getpwnam(', 'getpwnam_r('),
-    ('getpwuid(', 'getpwuid_r('),
-    ('gmtime(', 'gmtime_r('),
-    ('localtime(', 'localtime_r('),
-    ('rand(', 'rand_r('),
-    ('strtok(', 'strtok_r('),
-    ('ttyname(', 'ttyname_r('),
+# (non-threadsafe name, thread-safe alternative, validation pattern)
+#
+# The validation pattern is used to eliminate false positives such as:
+#  _rand();               // false positive due to substring match.
+#  ->rand();              // some member function rand().
+#  ACMRandom rand(seed);  // some variable named rand.
+#  ISAACRandom rand();    // another variable named rand.
+#
+# Basically we require the return value of these functions to be used
+# in some expression context on the same line by matching on some
+# operator before the function name.  This eliminates constructors and
+# member function calls.
+_UNSAFE_FUNC_PREFIX = r'(?:[-+*/=%^&|(<]\s*|>\s+)'
+_THREADING_LIST = (
+    ('asctime(', 'asctime_r(', _UNSAFE_FUNC_PREFIX + r'asctime\([^)]+\)'),
+    ('ctime(', 'ctime_r(', _UNSAFE_FUNC_PREFIX + r'ctime\([^)]+\)'),
+    ('getgrgid(', 'getgrgid_r(', _UNSAFE_FUNC_PREFIX + r'getgrgid\([^)]+\)'),
+    ('getgrnam(', 'getgrnam_r(', _UNSAFE_FUNC_PREFIX + r'getgrnam\([^)]+\)'),
+    ('getlogin(', 'getlogin_r(', _UNSAFE_FUNC_PREFIX + r'getlogin\(\)'),
+    ('getpwnam(', 'getpwnam_r(', _UNSAFE_FUNC_PREFIX + r'getpwnam\([^)]+\)'),
+    ('getpwuid(', 'getpwuid_r(', _UNSAFE_FUNC_PREFIX + r'getpwuid\([^)]+\)'),
+    ('gmtime(', 'gmtime_r(', _UNSAFE_FUNC_PREFIX + r'gmtime\([^)]+\)'),
+    ('localtime(', 'localtime_r(', _UNSAFE_FUNC_PREFIX + r'localtime\([^)]+\)'),
+    ('rand(', 'rand_r(', _UNSAFE_FUNC_PREFIX + r'rand\(\)'),
+    ('strtok(', 'strtok_r(',
+     _UNSAFE_FUNC_PREFIX + r'strtok\([^)]+\)'),
+    ('ttyname(', 'ttyname_r(', _UNSAFE_FUNC_PREFIX + r'ttyname\([^)]+\)'),
     )
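+
+# For example (illustrative), 'int seed = rand();' matches the rand() entry
+# because '=' marks an expression context, while 'ACMRandom rand(seed);' and
+# 'x->rand();' do not match and are not flagged.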
 
 
@@ -1583,14 +2132,13 @@
     error: The function to call with any errors found.
   """
   line = clean_lines.elided[linenum]
-  for single_thread_function, multithread_safe_function in threading_list:
-    ix = line.find(single_thread_function)
-    # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison
-    if ix >= 0 and (ix == 0 or (not line[ix - 1].isalnum() and
-                                line[ix - 1] not in ('_', '.', '>'))):
+  for single_thread_func, multithread_safe_func, pattern in _THREADING_LIST:
+    # Additional pattern matching check to confirm that this is the
+    # function we are looking for
+    if Search(pattern, line):
       error(filename, linenum, 'runtime/threadsafe_fn', 2,
-            'Consider using ' + multithread_safe_function +
-            '...) instead of ' + single_thread_function +
+            'Consider using ' + multithread_safe_func +
+            '...) instead of ' + single_thread_func +
             '...) for improved thread safety.')
 
 
@@ -1612,7 +2160,6 @@
           'VLOG() should be used with numeric verbosity level.  '
           'Use LOG() if you want symbolic severity levels.')
 
-
 # Matches invalid increment: *count++, which moves pointer instead of
 # incrementing a value.
 _RE_PATTERN_INVALID_INCREMENT = re.compile(
@@ -1641,13 +2188,29 @@
           'Changing pointer instead of value (or unused value of operator*).')
 
 
+def IsMacroDefinition(clean_lines, linenum):
+  if Search(r'^#define', clean_lines[linenum]):
+    return True
+
+  if linenum > 0 and Search(r'\\$', clean_lines[linenum - 1]):
+    return True
+
+  return False
+
+
+def IsForwardClassDeclaration(clean_lines, linenum):
+  return Match(r'^\s*(\btemplate\b)*.*class\s+\w+;\s*$', clean_lines[linenum])
+
+
 class _BlockInfo(object):
   """Stores information about a generic block of code."""
 
-  def __init__(self, seen_open_brace):
+  def __init__(self, linenum, seen_open_brace):
+    self.starting_linenum = linenum
     self.seen_open_brace = seen_open_brace
     self.open_parentheses = 0
     self.inline_asm = _NO_ASM
+    self.check_namespace_indentation = False
 
   def CheckBegin(self, filename, clean_lines, linenum, error):
     """Run checks that applies to text up to the opening brace.
@@ -1677,15 +2240,33 @@
     """
     pass
 
+  def IsBlockInfo(self):
+    """Returns true if this block is a _BlockInfo.
+
+    This is convenient for verifying that an object is an instance of
+    a _BlockInfo, but not an instance of any of the derived classes.
+
+    Returns:
+      True for this class, False for derived classes.
+    """
+    return self.__class__ == _BlockInfo
+
+
+class _ExternCInfo(_BlockInfo):
+  """Stores information about an 'extern "C"' block."""
+
+  def __init__(self, linenum):
+    _BlockInfo.__init__(self, linenum, True)
+
 
 class _ClassInfo(_BlockInfo):
   """Stores information about a class."""
 
   def __init__(self, name, class_or_struct, clean_lines, linenum):
-    _BlockInfo.__init__(self, False)
+    _BlockInfo.__init__(self, linenum, False)
     self.name = name
-    self.starting_linenum = linenum
     self.is_derived = False
+    self.check_namespace_indentation = True
     if class_or_struct == 'struct':
       self.access = 'public'
       self.is_struct = True
@@ -1695,11 +2276,7 @@
 
     # Remember initial indentation level for this class.  Using raw_lines here
     # instead of elided to account for leading comments.
-    initial_indent = Match(r'^( *)\S', clean_lines.raw_lines[linenum])
-    if initial_indent:
-      self.class_indent = len(initial_indent.group(1))
-    else:
-      self.class_indent = 0
+    self.class_indent = GetIndentLevel(clean_lines.raw_lines[linenum])
 
     # Try to find the end of the class.  This will be confused by things like:
     #   class A {
@@ -1721,6 +2298,23 @@
       self.is_derived = True
 
   def CheckEnd(self, filename, clean_lines, linenum, error):
+    # If there is a DISALLOW macro, it should appear near the end of
+    # the class.
+    seen_last_thing_in_class = False
+    for i in xrange(linenum - 1, self.starting_linenum, -1):
+      match = Search(
+          r'\b(DISALLOW_COPY_AND_ASSIGN|DISALLOW_IMPLICIT_CONSTRUCTORS)\(' +
+          self.name + r'\)',
+          clean_lines.elided[i])
+      if match:
+        if seen_last_thing_in_class:
+          error(filename, i, 'readability/constructors', 3,
+                match.group(1) + ' should be the last thing in the class')
+        break
+
+      if not Match(r'^\s*$', clean_lines.elided[i]):
+        seen_last_thing_in_class = True
+
     # Check that closing brace is aligned with beginning of the class.
     # Only do this if the closing brace is indented by only whitespaces.
     # This means we will not check single-line class definitions.
@@ -1738,9 +2332,9 @@
   """Stores information about a namespace."""
 
   def __init__(self, name, linenum):
-    _BlockInfo.__init__(self, False)
+    _BlockInfo.__init__(self, linenum, False)
     self.name = name or ''
-    self.starting_linenum = linenum
+    self.check_namespace_indentation = True
 
   def CheckEnd(self, filename, clean_lines, linenum, error):
     """Check end of namespace comments."""
@@ -1758,7 +2352,7 @@
     # deciding what these nontrivial things are, so this check is
     # triggered by namespace size only, which works most of the time.
     if (linenum - self.starting_linenum < 10
-        and not Match(r'};*\s*(//|/\*).*\bnamespace\b', line)):
+        and not Match(r'^\s*};*\s*(//|/\*).*\bnamespace\b', line)):
       return
 
     # Look for matching comment at end of namespace.
@@ -1775,17 +2369,24 @@
     # expected namespace.
     if self.name:
       # Named namespace
-      if not Match((r'};*\s*(//|/\*).*\bnamespace\s+' + re.escape(self.name) +
-                    r'[\*/\.\\\s]*$'),
+      if not Match((r'^\s*};*\s*(//|/\*).*\bnamespace\s+' +
+                    re.escape(self.name) + r'[\*/\.\\\s]*$'),
                    line):
         error(filename, linenum, 'readability/namespace', 5,
               'Namespace should be terminated with "// namespace %s"' %
               self.name)
     else:
       # Anonymous namespace
-      if not Match(r'};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line):
-        error(filename, linenum, 'readability/namespace', 5,
-              'Namespace should be terminated with "// namespace"')
+      if not Match(r'^\s*};*\s*(//|/\*).*\bnamespace[\*/\.\\\s]*$', line):
+        # If "// namespace anonymous" or "// anonymous namespace (more text)",
+        # mention "// anonymous namespace" as an acceptable form
+        if Match(r'^\s*}.*\b(namespace anonymous|anonymous namespace)\b', line):
+          error(filename, linenum, 'readability/namespace', 5,
+                'Anonymous namespace should be terminated with "// namespace"'
+                ' or "// anonymous namespace"')
+        else:
+          error(filename, linenum, 'readability/namespace', 5,
+                'Anonymous namespace should be terminated with "// namespace"')
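+
+    # Illustrative outcomes for a namespace closed on this line:
+    #   }  // namespace foo         OK for 'namespace foo'
+    #   }  // namespace             OK for an anonymous namespace
+    #   }  // anonymous namespace   OK (ends with 'namespace')
+    #   }  // namespace anonymous   flagged, with the dual-form hint above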
 
 
 class _PreprocessorInfo(object):
@@ -1802,7 +2403,7 @@
     self.seen_else = False
 
 
-class _NestingState(object):
+class NestingState(object):
   """Holds states related to parsing braces."""
 
   def __init__(self):
@@ -1814,6 +2415,17 @@
     # - _BlockInfo: some other type of block.
     self.stack = []
 
+    # Top of the previous stack before each Update().
+    #
+    # Because the nesting_stack is updated at the end of each line, we
+    # had to do some convoluted checks to find out what the current
+    # scope is at the beginning of the line.  This check is simplified by
+    # saving the previous top of nesting stack.
+    #
+    # We could save the full stack, but we only need the top.  Copying
+    # the full nesting stack would slow down cpplint by ~10%.
+    self.previous_stack_top = []
+
     # Stack of _PreprocessorInfo objects.
     self.pp_stack = []
 
@@ -1834,6 +2446,82 @@
     """
     return self.stack and isinstance(self.stack[-1], _NamespaceInfo)
 
+  def InExternC(self):
+    """Check if we are currently one level inside an 'extern "C"' block.
+
+    Returns:
+      True if top of the stack is an extern block, False otherwise.
+    """
+    return self.stack and isinstance(self.stack[-1], _ExternCInfo)
+
+  def InClassDeclaration(self):
+    """Check if we are currently one level inside a class or struct declaration.
+
+    Returns:
+      True if top of the stack is a class/struct, False otherwise.
+    """
+    return self.stack and isinstance(self.stack[-1], _ClassInfo)
+
+  def InAsmBlock(self):
+    """Check if we are currently one level inside an inline ASM block.
+
+    Returns:
+      True if the top of the stack is a block containing inline ASM.
+    """
+    return self.stack and self.stack[-1].inline_asm != _NO_ASM
+
+  def InTemplateArgumentList(self, clean_lines, linenum, pos):
+    """Check if current position is inside template argument list.
+
+    Args:
+      clean_lines: A CleansedLines instance containing the file.
+      linenum: The number of the line to check.
+      pos: position just after the suspected template argument.
+    Returns:
+      True if (linenum, pos) is inside template arguments.
+    """
+    while linenum < clean_lines.NumLines():
+      # Find the earliest character that might indicate a template argument
+      line = clean_lines.elided[linenum]
+      match = Match(r'^[^{};=\[\]\.<>]*(.)', line[pos:])
+      if not match:
+        linenum += 1
+        pos = 0
+        continue
+      token = match.group(1)
+      pos += len(match.group(0))
+
+      # These things do not look like a template argument list:
+      #   class Suspect {
+      #   class Suspect x; }
+      if token in ('{', '}', ';'): return False
+
+      # These things look like a template argument list:
+      #   template <class Suspect>
+      #   template <class Suspect = default_value>
+      #   template <class Suspect[]>
+      #   template <class Suspect...>
+      if token in ('>', '=', '[', ']', '.'): return True
+
+      # Check if token is an unmatched '<'.
+      # If not, move on to the next character.
+      if token != '<':
+        pos += 1
+        if pos >= len(line):
+          linenum += 1
+          pos = 0
+        continue
+
+      # We can't be sure if we just find a single '<', and need to
+      # find the matching '>'.
+      (_, end_line, end_pos) = CloseExpression(clean_lines, linenum, pos - 1)
+      if end_pos < 0:
+        # Not sure if template argument list or syntax error in file
+        return False
+      linenum = end_line
+      pos = end_pos
+    return False
+
   def UpdatePreprocessor(self, line):
     """Update preprocessor stack.
 
@@ -1890,6 +2578,7 @@
         # TODO(unknown): unexpected #endif, issue warning?
         pass
 
+  # TODO(unknown): Update() is too long, but we will refactor later.
   def Update(self, filename, clean_lines, linenum, error):
     """Update nesting state with current line.
 
@@ -1901,7 +2590,17 @@
     """
     line = clean_lines.elided[linenum]
 
-    # Update pp_stack first
+    # Remember top of the previous nesting stack.
+    #
+    # The stack is always pushed/popped and not modified in place, so
+    # we can just do a shallow copy instead of copy.deepcopy.  Using
+    # deepcopy would slow down cpplint by ~28%.
+    if self.stack:
+      self.previous_stack_top = self.stack[-1]
+    else:
+      self.previous_stack_top = None
+
+    # Update pp_stack
     self.UpdatePreprocessor(line)
 
     # Count parentheses.  This is to avoid adding struct arguments to
@@ -1952,32 +2651,27 @@
     # such as in:
     #   class LOCKABLE API Object {
     #   };
-    #
-    # Templates with class arguments may confuse the parser, for example:
-    #   template <class T
-    #             class Comparator = less<T>,
-    #             class Vector = vector<T> >
-    #   class HeapQueue {
-    #
-    # Because this parser has no nesting state about templates, by the
-    # time it saw "class Comparator", it may think that it's a new class.
-    # Nested templates have a similar problem:
-    #   template <
-    #       typename ExportedType,
-    #       typename TupleType,
-    #       template <typename, typename> class ImplTemplate>
-    #
-    # To avoid these cases, we ignore classes that are followed by '=' or '>'
     class_decl_match = Match(
-        r'\s*(template\s*<[\w\s<>,:]*>\s*)?'
-        r'(class|struct)\s+([A-Z_]+\s+)*(\w+(?:::\w+)*)'
-        r'(([^=>]|<[^<>]*>|<[^<>]*<[^<>]*>\s*>)*)$', line)
+        r'^(\s*(?:template\s*<[\w\s<>,:]*>\s*)?'
+        r'(class|struct)\s+(?:[A-Z_]+\s+)*(\w+(?:::\w+)*))'
+        r'(.*)$', line)
     if (class_decl_match and
         (not self.stack or self.stack[-1].open_parentheses == 0)):
-      self.stack.append(_ClassInfo(
-          class_decl_match.group(4), class_decl_match.group(2),
-          clean_lines, linenum))
-      line = class_decl_match.group(5)
+      # We do not want to accept classes that are actually template arguments:
+      #   template <class Ignore1,
+      #             class Ignore2 = Default<Args>,
+      #             template <Args> class Ignore3>
+      #   void Function() {};
+      #
+      # To avoid template argument cases, we scan forward and look for
+      # an unmatched '>'.  If we see one, assume we are inside a
+      # template argument list.
+      end_declaration = len(class_decl_match.group(1))
+      if not self.InTemplateArgumentList(clean_lines, linenum, end_declaration):
+        self.stack.append(_ClassInfo(
+            class_decl_match.group(3), class_decl_match.group(2),
+            clean_lines, linenum))
+        line = class_decl_match.group(4)
 
     # If we have not yet seen the opening brace for the innermost block,
     # run checks here.
@@ -2024,10 +2718,13 @@
         # stack otherwise.
         if not self.SeenOpenBrace():
           self.stack[-1].seen_open_brace = True
+        elif Match(r'^extern\s*"[^"]*"\s*\{', line):
+          self.stack.append(_ExternCInfo(linenum))
         else:
-          self.stack.append(_BlockInfo(True))
+          self.stack.append(_BlockInfo(linenum, True))
           if _MATCH_ASM.match(line):
             self.stack[-1].inline_asm = _BLOCK_ASM
+
       elif token == ';' or token == ')':
         # If we haven't seen an opening brace yet, but we already saw
         # a semicolon, this is probably a forward declaration.  Pop
@@ -2103,7 +2800,7 @@
     filename: The name of the current file.
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: A callable to which errors are reported, which takes 4 arguments:
            filename, line number, error level, and message
@@ -2136,7 +2833,8 @@
             r'\s+(register|static|extern|typedef)\b',
             line):
     error(filename, linenum, 'build/storage_class', 5,
-          'Storage class (static, extern, typedef, etc) should be first.')
+          'Storage-class specifier (static, extern, typedef, etc) should be '
+          'at the beginning of the declaration.')
 
   if Match(r'\s*#\s*endif\s*[^/\s]+', line):
     error(filename, linenum, 'build/endif_comment', 5,
@@ -2176,26 +2874,79 @@
 
   # Look for single-argument constructors that aren't marked explicit.
   # Technically a valid construct, but against style.
-  args = Match(r'\s+(?:inline\s+)?%s\s*\(([^,()]+)\)'
-               % re.escape(base_classname),
-               line)
-  if (args and
-      args.group(1) != 'void' and
-      not Match(r'(const\s+)?%s(\s+const)?\s*(?:<\w+>\s*)?&'
-                % re.escape(base_classname), args.group(1).strip())):
-    error(filename, linenum, 'runtime/explicit', 5,
-          'Single-argument constructors should be marked explicit.')
+  explicit_constructor_match = Match(
+      r'\s+(?:(?:inline|constexpr)\s+)*(explicit\s+)?'
+      r'(?:(?:inline|constexpr)\s+)*%s\s*'
+      r'\(((?:[^()]|\([^()]*\))*)\)'
+      % re.escape(base_classname),
+      line)
+
+  if explicit_constructor_match:
+    is_marked_explicit = explicit_constructor_match.group(1)
+
+    if not explicit_constructor_match.group(2):
+      constructor_args = []
+    else:
+      constructor_args = explicit_constructor_match.group(2).split(',')
+
+    # collapse arguments so that commas in template parameter lists and function
+    # argument parameter lists don't split arguments in two
+    i = 0
+    while i < len(constructor_args):
+      constructor_arg = constructor_args[i]
+      while (constructor_arg.count('<') > constructor_arg.count('>') or
+             constructor_arg.count('(') > constructor_arg.count(')')):
+        constructor_arg += ',' + constructor_args[i + 1]
+        del constructor_args[i + 1]
+      constructor_args[i] = constructor_arg
+      i += 1
+
+    defaulted_args = [arg for arg in constructor_args if '=' in arg]
+    noarg_constructor = (not constructor_args or  # empty arg list
+                         # 'void' arg specifier
+                         (len(constructor_args) == 1 and
+                          constructor_args[0].strip() == 'void'))
+    onearg_constructor = ((len(constructor_args) == 1 and  # exactly one arg
+                           not noarg_constructor) or
+                          # all but at most one arg defaulted
+                          (len(constructor_args) >= 1 and
+                           not noarg_constructor and
+                           len(defaulted_args) >= len(constructor_args) - 1))
+    initializer_list_constructor = bool(
+        onearg_constructor and
+        Search(r'\bstd\s*::\s*initializer_list\b', constructor_args[0]))
+    copy_constructor = bool(
+        onearg_constructor and
+        Match(r'(const\s+)?%s(\s*<[^>]*>)?(\s+const)?\s*(?:<\w+>\s*)?&'
+              % re.escape(base_classname), constructor_args[0].strip()))
+
+    if (not is_marked_explicit and
+        onearg_constructor and
+        not initializer_list_constructor and
+        not copy_constructor):
+      if defaulted_args:
+        error(filename, linenum, 'runtime/explicit', 5,
+              'Constructors callable with one argument '
+              'should be marked explicit.')
+      else:
+        error(filename, linenum, 'runtime/explicit', 5,
+              'Single-parameter constructors should be marked explicit.')
+    elif is_marked_explicit and not onearg_constructor:
+      if noarg_constructor:
+        error(filename, linenum, 'runtime/explicit', 5,
+              'Zero-parameter constructors should not be marked explicit.')
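+
+  # Illustrative outcomes inside 'class Foo' (hypothetical declarations):
+  #   Foo(int x);              -> single-parameter, should be explicit
+  #   Foo(int x, int y = 0);   -> callable with one argument, should be explicit
+  #   explicit Foo(int x);     -> no warning
+  #   Foo(const Foo&);         -> no warning (copy constructor)
+  #   explicit Foo();          -> zero-parameter, should not be explicit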
 
 
-def CheckSpacingForFunctionCall(filename, line, linenum, error):
+def CheckSpacingForFunctionCall(filename, clean_lines, linenum, error):
   """Checks for the correctness of various spacing around function calls.
 
   Args:
     filename: The name of the current file.
-    line: The text of the line to check.
+    clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
     error: The function to call with any errors found.
   """
+  line = clean_lines.elided[linenum]
 
   # Since function calls often occur inside if/for/while/switch
   # expressions - which have their own, more liberal conventions - we
@@ -2238,10 +2989,18 @@
       error(filename, linenum, 'whitespace/parens', 2,
             'Extra space after (')
     if (Search(r'\w\s+\(', fncall) and
-        not Search(r'#\s*define|typedef', fncall) and
-        not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall)):
-      error(filename, linenum, 'whitespace/parens', 4,
-            'Extra space before ( in function call')
+        not Search(r'_{0,2}asm_{0,2}\s+_{0,2}volatile_{0,2}\s+\(', fncall) and
+        not Search(r'#\s*define|typedef|using\s+\w+\s*=', fncall) and
+        not Search(r'\w\s+\((\w+::)*\*\w+\)\(', fncall) and
+        not Search(r'\bcase\s+\(', fncall)):
+      # TODO(unknown): Space after an operator function seems to be a common
+      # error; silence those for now by restricting them to highest verbosity.
+      if Search(r'\boperator_*\b', line):
+        error(filename, linenum, 'whitespace/parens', 0,
+              'Extra space before ( in function call')
+      else:
+        error(filename, linenum, 'whitespace/parens', 4,
+              'Extra space before ( in function call')
     # If the ) is followed only by a newline or a { + newline, assume it's
     # part of a control statement (if/while/etc), and don't complain
     if Search(r'[^)]\s+\)\s*[^{\s]', fncall):
@@ -2270,12 +3029,26 @@
   return not line or line.isspace()
 
 
+def CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line,
+                                 error):
+  is_namespace_indent_item = (
+      len(nesting_state.stack) > 1 and
+      nesting_state.stack[-1].check_namespace_indentation and
+      isinstance(nesting_state.previous_stack_top, _NamespaceInfo) and
+      nesting_state.previous_stack_top == nesting_state.stack[-2])
+
+  if ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item,
+                                     clean_lines.elided, line):
+    CheckItemIndentationInNamespace(filename, clean_lines.elided,
+                                    line, error)
+
+
 def CheckForFunctionLengths(filename, clean_lines, linenum,
                             function_state, error):
   """Reports for long function bodies.
 
  For an overview of why this is done, see:
-  http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions
+  https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Write_Short_Functions
 
   Uses a simplistic algorithm assuming other style guidelines
   (especially spacing) are followed.
@@ -2295,8 +3068,6 @@
   """
   lines = clean_lines.lines
   line = lines[linenum]
-  raw = clean_lines.raw_lines
-  raw_line = raw[linenum]
   joined_line = ''
 
   starting_func = False
@@ -2343,190 +3114,58 @@
 _RE_PATTERN_TODO = re.compile(r'^//(\s*)TODO(\(.+?\))?:?(\s|$)?')
 
 
-def CheckComment(comment, filename, linenum, error):
-  """Checks for common mistakes in TODO comments.
+def CheckComment(line, filename, linenum, next_line_start, error):
+  """Checks for common mistakes in comments.
 
   Args:
-    comment: The text of the comment from the line in question.
+    line: The line in question.
     filename: The name of the current file.
     linenum: The number of the line to check.
+    next_line_start: The first non-whitespace column of the next line.
     error: The function to call with any errors found.
   """
-  match = _RE_PATTERN_TODO.match(comment)
-  if match:
-    # One whitespace is correct; zero whitespace is handled elsewhere.
-    leading_whitespace = match.group(1)
-    if len(leading_whitespace) > 1:
-      error(filename, linenum, 'whitespace/todo', 2,
-            'Too many spaces before TODO')
+  commentpos = line.find('//')
+  if commentpos != -1:
+    # Check if the // may be in quotes.  If so, ignore it
+    if re.sub(r'\\.', '', line[0:commentpos]).count('"') % 2 == 0:
+      # Allow one space for new scopes, two spaces otherwise:
+      if (not (Match(r'^.*{ *//', line) and next_line_start == commentpos) and
+          ((commentpos >= 1 and
+            line[commentpos-1] not in string.whitespace) or
+           (commentpos >= 2 and
+            line[commentpos-2] not in string.whitespace))):
+        error(filename, linenum, 'whitespace/comments', 2,
+              'At least two spaces is best between code and comments')
 
-    username = match.group(2)
-    if not username:
-      error(filename, linenum, 'readability/todo', 2,
-            'Missing username in TODO; it should look like '
-            '"// TODO(my_username): Stuff."')
+      # Checks for common mistakes in TODO comments.
+      comment = line[commentpos:]
+      match = _RE_PATTERN_TODO.match(comment)
+      if match:
+        # One whitespace is correct; zero whitespace is handled elsewhere.
+        leading_whitespace = match.group(1)
+        if len(leading_whitespace) > 1:
+          error(filename, linenum, 'whitespace/todo', 2,
+                'Too many spaces before TODO')
 
-    middle_whitespace = match.group(3)
-    # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison
-    if middle_whitespace != ' ' and middle_whitespace != '':
-      error(filename, linenum, 'whitespace/todo', 2,
-            'TODO(my_username) should be followed by a space')
+        username = match.group(2)
+        if not username:
+          error(filename, linenum, 'readability/todo', 2,
+                'Missing username in TODO; it should look like '
+                '"// TODO(my_username): Stuff."')
 
-def CheckAccess(filename, clean_lines, linenum, nesting_state, error):
-  """Checks for improper use of DISALLOW* macros.
+        middle_whitespace = match.group(3)
+        # Comparisons made explicit for correctness -- pylint: disable=g-explicit-bool-comparison
+        if middle_whitespace != ' ' and middle_whitespace != '':
+          error(filename, linenum, 'whitespace/todo', 2,
+                'TODO(my_username) should be followed by a space')
 
-  Args:
-    filename: The name of the current file.
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: The number of the line to check.
-    nesting_state: A _NestingState instance which maintains information about
-                   the current stack of nested blocks being parsed.
-    error: The function to call with any errors found.
-  """
-  line = clean_lines.elided[linenum]  # get rid of comments and strings
-
-  matched = Match((r'\s*(DISALLOW_COPY_AND_ASSIGN|'
-                   r'DISALLOW_EVIL_CONSTRUCTORS|'
-                   r'DISALLOW_IMPLICIT_CONSTRUCTORS)'), line)
-  if not matched:
-    return
-  if nesting_state.stack and isinstance(nesting_state.stack[-1], _ClassInfo):
-    if nesting_state.stack[-1].access != 'private':
-      error(filename, linenum, 'readability/constructors', 3,
-            '%s must be in the private: section' % matched.group(1))
-
-  else:
-    # Found DISALLOW* macro outside a class declaration, or perhaps it
-    # was used inside a function when it should have been part of the
-    # class declaration.  We could issue a warning here, but it
-    # probably resulted in a compiler error already.
-    pass
-
-
-def FindNextMatchingAngleBracket(clean_lines, linenum, init_suffix):
-  """Find the corresponding > to close a template.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: Current line number.
-    init_suffix: Remainder of the current line after the initial <.
-
-  Returns:
-    True if a matching bracket exists.
-  """
-  line = init_suffix
-  nesting_stack = ['<']
-  while True:
-    # Find the next operator that can tell us whether < is used as an
-    # opening bracket or as a less-than operator.  We only want to
-    # warn on the latter case.
-    #
-    # We could also check all other operators and terminate the search
-    # early, e.g. if we got something like this "a<b+c", the "<" is
-    # most likely a less-than operator, but then we will get false
-    # positives for default arguments and other template expressions.
-    match = Search(r'^[^<>(),;\[\]]*([<>(),;\[\]])(.*)$', line)
-    if match:
-      # Found an operator, update nesting stack
-      operator = match.group(1)
-      line = match.group(2)
-
-      if nesting_stack[-1] == '<':
-        # Expecting closing angle bracket
-        if operator in ('<', '(', '['):
-          nesting_stack.append(operator)
-        elif operator == '>':
-          nesting_stack.pop()
-          if not nesting_stack:
-            # Found matching angle bracket
-            return True
-        elif operator == ',':
-          # Got a comma after a bracket, this is most likely a template
-          # argument.  We have not seen a closing angle bracket yet, but
-          # it's probably a few lines later if we look for it, so just
-          # return early here.
-          return True
-        else:
-          # Got some other operator.
-          return False
-
-      else:
-        # Expecting closing parenthesis or closing bracket
-        if operator in ('<', '(', '['):
-          nesting_stack.append(operator)
-        elif operator in (')', ']'):
-          # We don't bother checking for matching () or [].  If we got
-          # something like (] or [), it would have been a syntax error.
-          nesting_stack.pop()
-
-    else:
-      # Scan the next line
-      linenum += 1
-      if linenum >= len(clean_lines.elided):
-        break
-      line = clean_lines.elided[linenum]
-
-  # Exhausted all remaining lines and still no matching angle bracket.
-  # Most likely the input was incomplete, otherwise we should have
-  # seen a semicolon and returned early.
-  return True
-
-
-def FindPreviousMatchingAngleBracket(clean_lines, linenum, init_prefix):
-  """Find the corresponding < that started a template.
-
-  Args:
-    clean_lines: A CleansedLines instance containing the file.
-    linenum: Current line number.
-    init_prefix: Part of the current line before the initial >.
-
-  Returns:
-    True if a matching bracket exists.
-  """
-  line = init_prefix
-  nesting_stack = ['>']
-  while True:
-    # Find the previous operator
-    match = Search(r'^(.*)([<>(),;\[\]])[^<>(),;\[\]]*$', line)
-    if match:
-      # Found an operator, update nesting stack
-      operator = match.group(2)
-      line = match.group(1)
-
-      if nesting_stack[-1] == '>':
-        # Expecting opening angle bracket
-        if operator in ('>', ')', ']'):
-          nesting_stack.append(operator)
-        elif operator == '<':
-          nesting_stack.pop()
-          if not nesting_stack:
-            # Found matching angle bracket
-            return True
-        elif operator == ',':
-          # Got a comma before a bracket, this is most likely a
-          # template argument.  The opening angle bracket is probably
-          # there if we look for it, so just return early here.
-          return True
-        else:
-          # Got some other operator.
-          return False
-
-      else:
-        # Expecting opening parenthesis or opening bracket
-        if operator in ('>', ')', ']'):
-          nesting_stack.append(operator)
-        elif operator in ('(', '['):
-          nesting_stack.pop()
-
-    else:
-      # Scan the previous line
-      linenum -= 1
-      if linenum < 0:
-        break
-      line = clean_lines.elided[linenum]
-
-  # Exhausted all earlier lines and still no matching angle bracket.
-  return False
+      # If the comment contains an alphanumeric character, there
+      # should be a space somewhere between it and the // unless
+      # it's a /// or //! Doxygen comment.
+      if (Match(r'//[^ ]*\w', comment) and
+          not Match(r'(///|//\!)(\s+|$)', comment)):
+        error(filename, linenum, 'whitespace/comments', 4,
+              'Should have a space between // and comment')
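+
+# Examples (illustrative): 'x = 1; // note' fires the two-space rule;
+# 'x = 1;  //note' fires the missing-space-after-// rule; Doxygen-style
+# '/// summary' and '//!' comments are exempt from the latter.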
 
 
 def CheckSpacing(filename, clean_lines, linenum, nesting_state, error):
@@ -2542,7 +3181,7 @@
     filename: The name of the current file.
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: The function to call with any errors found.
   """
@@ -2565,7 +3204,12 @@
   #   }
   #
   # A warning about missing end of namespace comments will be issued instead.
-  if IsBlankLine(line) and not nesting_state.InNamespaceBody():
+  #
+  # Also skip blank line checks for 'extern "C"' blocks, which are formatted
+  # like namespaces.
+  if (IsBlankLine(line) and
+      not nesting_state.InNamespaceBody() and
+      not nesting_state.InExternC()):
     elided = clean_lines.elided
     prev_line = elided[linenum - 1]
     prevbrace = prev_line.rfind('{')
@@ -2628,54 +3272,64 @@
       error(filename, linenum, 'whitespace/blank_line', 3,
             'Do not leave a blank line after "%s:"' % matched.group(1))
 
-  # Next, we complain if there's a comment too near the text
-  commentpos = line.find('//')
-  if commentpos != -1:
-    # Check if the // may be in quotes.  If so, ignore it
-    # Comparisons made explicit for clarity -- pylint: disable=g-explicit-bool-comparison
-    if (line.count('"', 0, commentpos) -
-        line.count('\\"', 0, commentpos)) % 2 == 0:   # not in quotes
-      # Allow one space for new scopes, two spaces otherwise:
-      if (not Match(r'^\s*{ //', line) and
-          ((commentpos >= 1 and
-            line[commentpos-1] not in string.whitespace) or
-           (commentpos >= 2 and
-            line[commentpos-2] not in string.whitespace))):
-        error(filename, linenum, 'whitespace/comments', 2,
-              'At least two spaces is best between code and comments')
-      # There should always be a space between the // and the comment
-      commentend = commentpos + 2
-      if commentend < len(line) and not line[commentend] == ' ':
-        # but some lines are exceptions -- e.g. if they're big
-        # comment delimiters like:
-        # //----------------------------------------------------------
-        # or are an empty C++ style Doxygen comment, like:
-        # ///
-        # or C++ style Doxygen comments placed after the variable:
-        # ///<  Header comment
-        # //!<  Header comment
-        # or they begin with multiple slashes followed by a space:
-        # //////// Header comment
-        match = (Search(r'[=/-]{4,}\s*$', line[commentend:]) or
-                 Search(r'^/$', line[commentend:]) or
-                 Search(r'^!< ', line[commentend:]) or
-                 Search(r'^/< ', line[commentend:]) or
-                 Search(r'^/+ ', line[commentend:]))
-        if not match:
-          error(filename, linenum, 'whitespace/comments', 4,
-                'Should have a space between // and comment')
-      CheckComment(line[commentpos:], filename, linenum, error)
+  # Next, check comments
+  next_line_start = 0
+  if linenum + 1 < clean_lines.NumLines():
+    next_line = raw[linenum + 1]
+    next_line_start = len(next_line) - len(next_line.lstrip())
+  CheckComment(line, filename, linenum, next_line_start, error)
 
-  line = clean_lines.elided[linenum]  # get rid of comments and strings
+  # get rid of comments and strings
+  line = clean_lines.elided[linenum]
 
-  # Don't try to do spacing checks for operator methods
-  line = re.sub(r'operator(==|!=|<|<<|<=|>=|>>|>)\(', 'operator\(', line)
+  # You shouldn't have spaces before your brackets, except maybe after
+  # 'delete []', 'return []() {};', or 'auto [abc, ...] = ...;'.
+  if Search(r'\w\s+\[', line) and not Search(r'(?:auto&?|delete|return)\s+\[', line):
+    error(filename, linenum, 'whitespace/braces', 5,
+          'Extra space before [')
+
+  # In range-based for, we want spaces before and after the colon, but
+  # not around "::" tokens that might appear.
+  if (Search(r'for *\(.*[^:]:[^: ]', line) or
+      Search(r'for *\(.*[^: ]:[^:]', line)):
+    error(filename, linenum, 'whitespace/forcolon', 2,
+          'Missing space around colon in range-based for loop')
+
+
+def CheckOperatorSpacing(filename, clean_lines, linenum, error):
+  """Checks for horizontal spacing around operators.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # Don't try to do spacing checks for operator methods.  Do this by
+  # replacing the troublesome characters with something else,
+  # preserving column position for all other characters.
+  #
+  # The replacement is done repeatedly to avoid false positives from
+  # operators that call operators.
+  while True:
+    match = Match(r'^(.*\boperator\b)(\S+)(\s*\(.*)$', line)
+    if match:
+      line = match.group(1) + ('_' * len(match.group(2))) + match.group(3)
+    else:
+      break
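+
+  # e.g. 'bool operator==(const T& a, const T& b)' becomes
+  # 'bool operator__(const T& a, const T& b)' before the checks below, so
+  # the '==' in the name is not mistaken for a comparison missing spaces.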
 
   # We allow no-spaces around = within an if: "if ( (a=Foo()) == 0 )".
   # Otherwise not.  Note we only check for non-spaces on *both* sides;
   # sometimes people put non-spaces on one side when aligning ='s among
   # many lines (not that this is behavior that I approve of...)
-  if Search(r'[\w.]=[\w.]', line) and not Search(r'\b(if|while) ', line):
+  if ((Search(r'[\w.]=', line) or
+       Search(r'=[\w.]', line))
+      and not Search(r'\b(if|while|for) ', line)
+      # Operators taken from [lex.operators] in C++11 standard.
+      and not Search(r'(>=|<=|==|!=|&=|\^=|\|=|\+=|\*=|\/=|\%=)', line)
+      and not Search(r'operator=', line)):
     error(filename, linenum, 'whitespace/operators', 4,
           'Missing spaces around =')
 
@@ -2687,42 +3341,51 @@
   #
   # Check <= and >= first to avoid false positives with < and >, then
   # check non-include lines for spacing around < and >.
-  match = Search(r'[^<>=!\s](==|!=|<=|>=)[^<>=!\s]', line)
+  #
+  # If the operator is followed by a comma, assume it's being used in a
+  # macro context and don't do any checks.  This avoids false
+  # positives.
+  #
+  # Note that && is not included here.  This is because there are too
+  # many false positives due to RValue references.
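+  #
+  # For instance, "if (a==b)" would be flagged, while something like
+  # "MACRO(==, x)" is not, since the operator is followed by a comma
+  # (illustrative examples only).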
+  match = Search(r'[^<>=!\s](==|!=|<=|>=|\|\|)[^<>=!\s,;\)]', line)
   if match:
     error(filename, linenum, 'whitespace/operators', 3,
           'Missing spaces around %s' % match.group(1))
-  # We allow no-spaces around << when used like this: 10<<20, but
-  # not otherwise (particularly, not when used as streams)
-  # Also ignore using ns::operator<<;
-  match = Search(r'(operator|\S)(?:L|UL|ULL|l|ul|ull)?<<(\S)', line)
-  if (match and
-      not (match.group(1).isdigit() and match.group(2).isdigit()) and
-      not (match.group(1) == 'operator' and match.group(2) == ';')):
-    error(filename, linenum, 'whitespace/operators', 3,
-          'Missing spaces around <<')
   elif not Match(r'#.*include', line):
-    # Avoid false positives on ->
-    reduced_line = line.replace('->', '')
-
     # Look for < that is not surrounded by spaces.  This is only
     # triggered if both sides are missing spaces, even though
     # technically we should flag if at least one side is missing a
     # space.  This is done to avoid some false positives with shifts.
-    match = Search(r'[^\s<]<([^\s=<].*)', reduced_line)
-    if (match and
-        not FindNextMatchingAngleBracket(clean_lines, linenum, match.group(1))):
-      error(filename, linenum, 'whitespace/operators', 3,
-            'Missing spaces around <')
+    match = Match(r'^(.*[^\s<])<[^\s=<,]', line)
+    if match:
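+      # If CloseExpression finds a matching ">", the "<" is part of a
+      # template argument list rather than a comparison, so only an
+      # unclosed "<" (end_pos of -1) is reported below.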
+      (_, _, end_pos) = CloseExpression(
+          clean_lines, linenum, len(match.group(1)))
+      if end_pos <= -1:
+        error(filename, linenum, 'whitespace/operators', 3,
+              'Missing spaces around <')
 
     # Look for > that is not surrounded by spaces.  Similar to the
     # above, we only trigger if both sides are missing spaces to avoid
     # false positives with shifts.
-    match = Search(r'^(.*[^\s>])>[^\s=>]', reduced_line)
-    if (match and
-        not FindPreviousMatchingAngleBracket(clean_lines, linenum,
-                                             match.group(1))):
-      error(filename, linenum, 'whitespace/operators', 3,
-            'Missing spaces around >')
+    match = Match(r'^(.*[^-\s>])>[^\s=>,]', line)
+    if match:
+      (_, _, start_pos) = ReverseCloseExpression(
+          clean_lines, linenum, len(match.group(1)))
+      if start_pos <= -1:
+        error(filename, linenum, 'whitespace/operators', 3,
+              'Missing spaces around >')
+
+  # We allow no-spaces around << when used like this: 10<<20, but
+  # not otherwise (particularly, not when used as streams)
+  #
+  # We also allow operators following an opening parenthesis, since
+  # those tend to be macros that deal with operators.
+  match = Search(r'(operator|[^\s(<])(?:L|UL|LL|ULL|l|ul|ll|ull)?<<([^\s,=<])', line)
+  if (match and not (match.group(1).isdigit() and match.group(2).isdigit()) and
+      not (match.group(1) == 'operator' and match.group(2) == ';')):
+    error(filename, linenum, 'whitespace/operators', 3,
+          'Missing spaces around <<')
 
   # We allow no-spaces around >> for almost anything.  This is because
   # C++11 allows ">>" to close nested templates, which accounts for
@@ -2747,7 +3410,19 @@
     error(filename, linenum, 'whitespace/operators', 4,
           'Extra space for operator %s' % match.group(1))
 
-  # A pet peeve of mine: no spaces after an if, while, switch, or for
+
+def CheckParenthesisSpacing(filename, clean_lines, linenum, error):
+  """Checks for horizontal spacing around parentheses.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # No spaces after an if, while, switch, or for
   match = Search(r' (if\(|for\(|while\(|switch\()', line)
   if match:
     error(filename, linenum, 'whitespace/parens', 5,
@@ -2773,6 +3448,19 @@
             'Should have zero or one spaces inside ( and ) in %s' %
             match.group(1))
 
+
+def CheckCommaSpacing(filename, clean_lines, linenum, error):
+  """Checks for horizontal spacing near commas and semicolons.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  raw = clean_lines.lines_without_raw_strings
+  line = clean_lines.elided[linenum]
+
   # You should always have a space after a comma (either as fn arg or operator)
   #
   # This does not apply when the non-space character following the
@@ -2783,7 +3471,8 @@
   # verify that lines contain missing whitespaces, second pass on raw
   # lines to confirm that those missing whitespaces are not due to
   # elided comments.
-  if Search(r',[^,\s]', line) and Search(r',[^,\s]', raw[linenum]):
+  if (Search(r',[^,\s]', ReplaceAll(r'\boperator\s*,\s*\(', 'F(', line)) and
+      Search(r',[^,\s]', raw[linenum])):
     error(filename, linenum, 'whitespace/comma', 3,
           'Missing space after ,')
 
@@ -2795,14 +3484,91 @@
     error(filename, linenum, 'whitespace/semicolon', 3,
           'Missing space after ;')
 
-  # Next we will look for issues with function calls.
-  CheckSpacingForFunctionCall(filename, line, linenum, error)
+
+def _IsType(clean_lines, nesting_state, expr):
+  """Check if expression looks like a type name, returns true if so.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    nesting_state: A NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    expr: The expression to check.
+  Returns:
+    True, if token looks like a type.
+  """
+  # Keep only the last token in the expression
+  last_word = Match(r'^.*(\b\S+)$', expr)
+  if last_word:
+    token = last_word.group(1)
+  else:
+    token = expr
+
+  # Match native types and stdint types
+  if _TYPES.match(token):
+    return True
+
+  # Try a bit harder to match templated types.  Walk up the nesting
+  # stack until we find something that resembles a typename
+  # declaration for what we are looking for.
+  typename_pattern = (r'\b(?:typename|class|struct)\s+' + re.escape(token) +
+                      r'\b')
+  block_index = len(nesting_state.stack) - 1
+  while block_index >= 0:
+    if isinstance(nesting_state.stack[block_index], _NamespaceInfo):
+      return False
+
+    # Found where the opening brace is.  We want to scan from this
+    # line up to the beginning of the function, minus a few lines.
+    #   template <typename Type1,  // stop scanning here
+    #             ...>
+    #   class C
+    #     : public ... {  // start scanning here
+    last_line = nesting_state.stack[block_index].starting_linenum
+
+    next_block_start = 0
+    if block_index > 0:
+      next_block_start = nesting_state.stack[block_index - 1].starting_linenum
+    first_line = last_line
+    while first_line >= next_block_start:
+      if clean_lines.elided[first_line].find('template') >= 0:
+        break
+      first_line -= 1
+    if first_line < next_block_start:
+      # Didn't find any "template" keyword before reaching the next block,
+      # there are probably no template things to check for this block
+      block_index -= 1
+      continue
+
+    # Look for typename in the specified range
+    for i in xrange(first_line, last_line + 1, 1):
+      if Search(typename_pattern, clean_lines.elided[i]):
+        return True
+    block_index -= 1
+
+  return False
+
+
+def CheckBracesSpacing(filename, clean_lines, linenum, nesting_state, error):
+  """Checks for horizontal spacing near commas.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    nesting_state: A NestingState instance which maintains information about
+                   the current stack of nested blocks being parsed.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
 
   # Except after an opening paren, or after another opening brace (in case of
   # an initializer list, for instance), you should have spaces before your
-  # braces. And since you should never have braces at the beginning of a line,
-  # this is an easy test.
-  match = Match(r'^(.*[^ ({]){', line)
+  # braces when they are delimiting blocks, classes, namespaces etc.
+  # And since you should never have braces at the beginning of a line,
+  # this is an easy test.  Except that braces used for initialization don't
+  # follow the same rule; we often don't want spaces before those.
+  match = Match(r'^(.*[^ ({>]){', line)
+
   if match:
     # Try a bit harder to check for brace initialization.  This
     # happens in one of the following forms:
@@ -2813,10 +3579,12 @@
     #   LastArgument(..., type{});
     #   LOG(INFO) << type{} << " ...";
     #   map_of_type[{...}] = ...;
+    #   ternary = expr ? new type{} : nullptr;
+    #   OuterTemplate<InnerTemplateConstructor<Type>{}>
     #
     # We check for the character following the closing brace, and
     # silence the warning if it's one of those listed above, i.e.
-    # "{.;,)<]".
+    # "{.;,)<>]:".
     #
     # To account for nested initializer list, we allow any number of
     # closing braces up to "{;,)<".  We can't simply silence the
@@ -2830,6 +3598,7 @@
     # There is a false negative with this approach if people inserted
     # spurious semicolons, e.g. "if (cond){};", but we will catch the
     # spurious semicolon with a separate check.
+    leading_text = match.group(1)
     (endline, endlinenum, endpos) = CloseExpression(
         clean_lines, linenum, len(match.group(1)))
     trailing_text = ''
@@ -2838,7 +3607,11 @@
     for offset in xrange(endlinenum + 1,
                          min(endlinenum + 3, clean_lines.NumLines() - 1)):
       trailing_text += clean_lines.elided[offset]
-    if not Match(r'^[\s}]*[{.;,)<\]]', trailing_text):
+    # We also suppress warnings for `uint64_t{expression}` etc., as the style
+    # guide recommends brace initialization for integral types to avoid
+    # overflow/truncation.
+    if (not Match(r'^[\s}]*[{.;,)<>\]:]', trailing_text)
+        and not _IsType(clean_lines, nesting_state, leading_text)):
       error(filename, linenum, 'whitespace/braces', 5,
             'Missing space before {')
 
@@ -2847,12 +3620,6 @@
     error(filename, linenum, 'whitespace/braces', 5,
           'Missing space before else')
 
-  # You shouldn't have spaces before your brackets, except maybe after
-  # 'delete []' or 'new char * []'.
-  if Search(r'\w\s+\[', line) and not Search(r'delete\s+\[', line):
-    error(filename, linenum, 'whitespace/braces', 5,
-          'Extra space before [')
-
   # You shouldn't have a space before a semicolon at the end of the line.
   # There's a special case for "for" since the style guide allows space before
   # the semicolon there.
@@ -2869,12 +3636,23 @@
           'Extra space before last semicolon. If this should be an empty '
           'statement, use {} instead.')
 
-  # In range-based for, we wanted spaces before and after the colon, but
-  # not around "::" tokens that might appear.
-  if (Search('for *\(.*[^:]:[^: ]', line) or
-      Search('for *\(.*[^: ]:[^:]', line)):
-    error(filename, linenum, 'whitespace/forcolon', 2,
-          'Missing space around colon in range-based for loop')
+
+def IsDecltype(clean_lines, linenum, column):
+  """Check if the token ending on (linenum, column) is decltype().
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: the number of the line to check.
+    column: end column of the token to check.
+  Returns:
+    True if this token is decltype() expression, False otherwise.
+  """
+  (text, _, start_col) = ReverseCloseExpression(clean_lines, linenum, column)
+  if start_col < 0:
+    return False
+  if Search(r'\bdecltype\s*$', text[0:start_col]):
+    return True
+  return False
 
 
 def CheckSectionSpacing(filename, clean_lines, class_info, linenum, error):
@@ -2974,15 +3752,18 @@
     # used for brace initializers inside function calls.  We don't detect this
     # perfectly: we just don't complain if the last non-whitespace character on
     # the previous non-blank line is ',', ';', ':', '(', '{', or '}', or if the
-    # previous line starts a preprocessor block.
+    # previous line starts a preprocessor block. We also allow a brace on the
+    # following line if it is part of an array initialization and would not fit
+    # within the 80 character limit of the preceding line.
     prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
     if (not Search(r'[,;:}{(]\s*$', prevline) and
-        not Match(r'\s*#', prevline)):
+        not Match(r'\s*#', prevline) and
+        not (GetLineWidth(prevline) > _line_length - 2 and '[]' in prevline)):
       error(filename, linenum, 'whitespace/braces', 4,
             '{ should almost always be at the end of the previous line')
 
   # An else clause should be on the same line as the preceding closing brace.
-  if Match(r'\s*else\s*', line):
+  if Match(r'\s*else\b\s*(?:if\b|\{|$)', line):
     prevline = GetPreviousNonBlankLine(clean_lines, linenum)[0]
     if Match(r'\s*}\s*$', prevline):
       error(filename, linenum, 'whitespace/newline', 4,
@@ -2990,19 +3771,20 @@
 
   # If braces come on one side of an else, they should be on both.
   # However, we have to worry about "else if" that spans multiple lines!
-  if Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line):
-    if Search(r'}\s*else if([^{]*)$', line):       # could be multi-line if
-      # find the ( after the if
-      pos = line.find('else if')
-      pos = line.find('(', pos)
-      if pos > 0:
-        (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos)
-        if endline[endpos:].find('{') == -1:    # must be brace after if
-          error(filename, linenum, 'readability/braces', 5,
-                'If an else has a brace on one side, it should have it on both')
-    else:            # common case: else not followed by a multi-line if
-      error(filename, linenum, 'readability/braces', 5,
-            'If an else has a brace on one side, it should have it on both')
+  if Search(r'else if\s*\(', line):       # could be multi-line if
+    brace_on_left = bool(Search(r'}\s*else if\s*\(', line))
+    # find the ( after the if
+    pos = line.find('else if')
+    pos = line.find('(', pos)
+    if pos > 0:
+      (endline, _, endpos) = CloseExpression(clean_lines, linenum, pos)
+      brace_on_right = endline[endpos:].find('{') != -1
+      if brace_on_left != brace_on_right:    # braces should match on both sides
+        error(filename, linenum, 'readability/braces', 5,
+              'If an else has a brace on one side, it should have it on both')
+  elif Search(r'}\s*else[^{]*$', line) or Match(r'[^}]*else\s*{', line):
+    error(filename, linenum, 'readability/braces', 5,
+          'If an else has a brace on one side, it should have it on both')
 
   # Likewise, an else should never have the else clause on the same line
   if Search(r'\belse [^\s{]', line) and not Search(r'\belse if\b', line):
@@ -3014,11 +3796,79 @@
     error(filename, linenum, 'whitespace/newline', 4,
           'do/while clauses should not be on a single line')
 
+  # Check single-line if/else bodies. The style guide says 'curly braces are not
+  # required for single-line statements'. We additionally allow multi-line,
+  # single statements, but we reject anything with more than one semicolon in
+  # it. This means that the first semicolon after the if should be at the end of
+  # its line, and the line after that should have an indent level equal to or
+  # lower than the if. We also check for ambiguous if/else nesting without
+  # braces.
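+  #
+  # For example (hypothetical snippets): "if (x) Foo();" is acceptable,
+  # but an unbraced body with a trailing second statement, or an "else"
+  # indented differently from its "if", draws a readability/braces error.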
+  if_else_match = Search(r'\b(if\s*\(|else\b)', line)
+  if if_else_match and not Match(r'\s*#', line):
+    if_indent = GetIndentLevel(line)
+    endline, endlinenum, endpos = line, linenum, if_else_match.end()
+    if_match = Search(r'\bif\s*\(', line)
+    if if_match:
+      # This could be a multiline if condition, so find the end first.
+      pos = if_match.end() - 1
+      (endline, endlinenum, endpos) = CloseExpression(clean_lines, linenum, pos)
+    # Check for an opening brace, either directly after the if or on the next
+    # line. If found, this isn't a single-statement conditional.
+    if (not Match(r'\s*{', endline[endpos:])
+        and not (Match(r'\s*$', endline[endpos:])
+                 and endlinenum < (len(clean_lines.elided) - 1)
+                 and Match(r'\s*{', clean_lines.elided[endlinenum + 1]))):
+      while (endlinenum < len(clean_lines.elided)
+             and ';' not in clean_lines.elided[endlinenum][endpos:]):
+        endlinenum += 1
+        endpos = 0
+      if endlinenum < len(clean_lines.elided):
+        endline = clean_lines.elided[endlinenum]
+        # We allow a mix of whitespace and closing braces (e.g. for one-liner
+        # methods) and a single \ after the semicolon (for macros)
+        endpos = endline.find(';')
+        if not Match(r';[\s}]*(\\?)$', endline[endpos:]):
+          # Semicolon isn't the last character, there's something trailing.
+          # Output a warning if the semicolon is not contained inside
+          # a lambda expression.
+          if not Match(r'^[^{};]*\[[^\[\]]*\][^{}]*\{[^{}]*\}\s*\)*[;,]\s*$',
+                       endline):
+            error(filename, linenum, 'readability/braces', 4,
+                  'If/else bodies with multiple statements require braces')
+        elif endlinenum < len(clean_lines.elided) - 1:
+          # Make sure the next line is dedented
+          next_line = clean_lines.elided[endlinenum + 1]
+          next_indent = GetIndentLevel(next_line)
+          # With ambiguous nested if statements, this will error out on the
+          # if that *doesn't* match the else, regardless of whether it's the
+          # inner one or outer one.
+          if (if_match and Match(r'\s*else\b', next_line)
+              and next_indent != if_indent):
+            error(filename, linenum, 'readability/braces', 4,
+                  'Else clause should be indented at the same level as if. '
+                  'Ambiguous nested if/else chains require braces.')
+          elif next_indent > if_indent:
+            error(filename, linenum, 'readability/braces', 4,
+                  'If/else bodies with multiple statements require braces')
+
+
+def CheckTrailingSemicolon(filename, clean_lines, linenum, error):
+  """Looks for redundant trailing semicolon.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+
+  line = clean_lines.elided[linenum]
+
   # Block bodies should not be followed by a semicolon.  Due to C++11
   # brace initialization, there are more places where semicolons are
-  # required than not, so we use a whitelist approach to check these
-  # rather than a blacklist.  These are the places where "};" should
-  # be replaced by just "}":
+  # required than not, so we explicitly list the allowed rules rather
+  # than listing the disallowed ones.  These are the places where "};"
+  # should be replaced by just "}":
   # 1. Some flavor of block following closing parenthesis:
   #    for (;;) {};
   #    while (...) {};
@@ -3074,28 +3924,40 @@
     #  - INTERFACE_DEF
     #  - EXCLUSIVE_LOCKS_REQUIRED, SHARED_LOCKS_REQUIRED, LOCKS_EXCLUDED:
     #
-    # We implement a whitelist of safe macros instead of a blacklist of
+    # We implement a list of safe macros instead of a list of
     # unsafe macros, even though the latter appears less frequently in
     # google code and would have been easier to implement.  This is because
-    # the downside for getting the whitelist wrong means some extra
-    # semicolons, while the downside for getting the blacklist wrong
+    # the downside of getting the allowed list wrong is some extra
+    # semicolons, while getting the disallowed list wrong
     # would result in compile errors.
     #
-    # In addition to macros, we also don't want to warn on compound
-    # literals.
+    # In addition to macros, we also don't want to warn on
+    #  - Compound literals
+    #  - Lambdas
+    #  - alignas specifier with anonymous structs
+    #  - decltype
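+    #
+    # e.g. "auto f = []() { ... };" keeps its semicolon, while
+    # "void Foo() { ... };" would still be flagged (illustrative examples).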
     closing_brace_pos = match.group(1).rfind(')')
     opening_parenthesis = ReverseCloseExpression(
         clean_lines, linenum, closing_brace_pos)
     if opening_parenthesis[2] > -1:
       line_prefix = opening_parenthesis[0][0:opening_parenthesis[2]]
-      macro = Search(r'\b([A-Z_]+)\s*$', line_prefix)
+      macro = Search(r'\b([A-Z_][A-Z0-9_]*)\s*$', line_prefix)
+      func = Match(r'^(.*\])\s*$', line_prefix)
       if ((macro and
            macro.group(1) not in (
                'TEST', 'TEST_F', 'MATCHER', 'MATCHER_P', 'TYPED_TEST',
                'EXCLUSIVE_LOCKS_REQUIRED', 'SHARED_LOCKS_REQUIRED',
                'LOCKS_EXCLUDED', 'INTERFACE_DEF')) or
+          (func and not Search(r'\boperator\s*\[\s*\]', func.group(1))) or
+          Search(r'\b(?:struct|union)\s+alignas\s*$', line_prefix) or
+          Search(r'\bdecltype$', line_prefix) or
           Search(r'\s+=\s*$', line_prefix)):
         match = None
+    if (match and
+        opening_parenthesis[1] > 1 and
+        Search(r'\]\s*$', clean_lines.elided[opening_parenthesis[1] - 1])):
+      # Multi-line lambda-expression
+      match = None
 
   else:
     # Try matching cases 2-3.
@@ -3125,6 +3987,14 @@
       # outputting warnings for the matching closing brace, if there are
       # nested blocks with trailing semicolons, we will get the error
       # messages in reversed order.
+
+      # We need to check the line forward for NOLINT
+      raw_lines = clean_lines.raw_lines
+      ParseNolintSuppressions(filename, raw_lines[endlinenum-1], endlinenum-1,
+                              error)
+      ParseNolintSuppressions(filename, raw_lines[endlinenum], endlinenum,
+                              error)
+
       error(filename, endlinenum, 'readability/braces', 4,
             "You don't need a ; after a }")
 
@@ -3148,7 +4018,7 @@
   line = clean_lines.elided[linenum]
   matched = Match(r'\s*(for|while|if)\s*\(', line)
   if matched:
-    # Find the end of the conditional expression
+    # Find the end of the conditional expression.
     (end_line, end_linenum, end_pos) = CloseExpression(
         clean_lines, linenum, line.find('('))
 
@@ -3163,6 +4033,98 @@
         error(filename, end_linenum, 'whitespace/empty_loop_body', 5,
               'Empty loop bodies should use {} or continue')
 
+    # Check for if statements that have completely empty bodies (no comments)
+    # and no else clauses.
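+    # For example, "if (cond) {}" with nothing following is flagged, while
+    # "if (cond) {} else { ... }" is not (illustrative examples).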
+    if end_pos >= 0 and matched.group(1) == 'if':
+      # Find the position of the opening { for the if statement.
+      # Return without logging an error if it has no brackets.
+      opening_linenum = end_linenum
+      opening_line_fragment = end_line[end_pos:]
+      # Loop until EOF or find anything that's not whitespace or opening {.
+      while not Search(r'^\s*\{', opening_line_fragment):
+        if Search(r'^(?!\s*$)', opening_line_fragment):
+          # Conditional has no brackets.
+          return
+        opening_linenum += 1
+        if opening_linenum == len(clean_lines.elided):
+          # Couldn't find conditional's opening { or any code before EOF.
+          return
+        opening_line_fragment = clean_lines.elided[opening_linenum]
+      # Set opening_line (opening_line_fragment may not be entire opening line).
+      opening_line = clean_lines.elided[opening_linenum]
+
+      # Find the position of the closing }.
+      opening_pos = opening_line_fragment.find('{')
+      if opening_linenum == end_linenum:
+        # We need to make opening_pos relative to the start of the entire line.
+        opening_pos += end_pos
+      (closing_line, closing_linenum, closing_pos) = CloseExpression(
+          clean_lines, opening_linenum, opening_pos)
+      if closing_pos < 0:
+        return
+
+      # Now construct the body of the conditional. This consists of the portion
+      # of the opening line after the {, all lines until the closing line,
+      # and the portion of the closing line before the }.
+      if (clean_lines.raw_lines[opening_linenum] !=
+          CleanseComments(clean_lines.raw_lines[opening_linenum])):
+        # Opening line ends with a comment, so conditional isn't empty.
+        return
+      if closing_linenum > opening_linenum:
+        # Opening line after the {. Ignore comments here since we checked above.
+        body = list(opening_line[opening_pos+1:])
+        # All lines until closing line, excluding closing line, with comments.
+        body.extend(clean_lines.raw_lines[opening_linenum+1:closing_linenum])
+        # Closing line before the }. Won't (and can't) have comments.
+        body.append(clean_lines.elided[closing_linenum][:closing_pos-1])
+        body = '\n'.join(body)
+      else:
+        # If statement has brackets and fits on a single line.
+        body = opening_line[opening_pos+1:closing_pos-1]
+
+      # Check if the body is empty
+      if not _EMPTY_CONDITIONAL_BODY_PATTERN.search(body):
+        return
+      # The body is empty. Now make sure there's not an else clause.
+      current_linenum = closing_linenum
+      current_line_fragment = closing_line[closing_pos:]
+      # Loop until EOF or find anything that's not whitespace or else clause.
+      while Search(r'^\s*$|^(?=\s*else)', current_line_fragment):
+        if Search(r'^(?=\s*else)', current_line_fragment):
+          # Found an else clause, so don't log an error.
+          return
+        current_linenum += 1
+        if current_linenum == len(clean_lines.elided):
+          break
+        current_line_fragment = clean_lines.elided[current_linenum]
+
+      # The body is empty and there's no else clause until EOF or other code.
+      error(filename, end_linenum, 'whitespace/empty_if_body', 4,
+            'If statement had no body and no else clause')
+
+
+def FindCheckMacro(line):
+  """Find a replaceable CHECK-like macro.
+
+  Args:
+    line: line to search on.
+  Returns:
+    (macro name, start position), or (None, -1) if no replaceable
+    macro is found.
+  """
+  for macro in _CHECK_MACROS:
+    i = line.find(macro)
+    if i >= 0:
+      # Find opening parenthesis.  Do a regular expression match here
+      # to make sure that we are matching the expected CHECK macro, as
+      # opposed to some other macro that happens to contain the CHECK
+      # substring.
+      matched = Match(r'^(.*\b' + macro + r'\s*)\(', line)
+      if not matched:
+        continue
+      return (macro, len(matched.group(1)))
+  return (None, -1)
+
 
 def CheckCheck(filename, clean_lines, linenum, error):
   """Checks the use of CHECK and EXPECT macros.
@@ -3176,24 +4138,8 @@
 
   # Decide the set of replacement macros that should be suggested
   lines = clean_lines.elided
-  check_macro = None
-  start_pos = -1
-  for macro in _CHECK_MACROS:
-    i = lines[linenum].find(macro)
-    if i >= 0:
-      check_macro = macro
-
-      # Find opening parenthesis.  Do a regular expression match here
-      # to make sure that we are matching the expected CHECK macro, as
-      # opposed to some other macro that happens to contain the CHECK
-      # substring.
-      matched = Match(r'^(.*\b' + check_macro + r'\s*)\(', lines[linenum])
-      if not matched:
-        continue
-      start_pos = len(matched.group(1))
-      break
-  if not check_macro or start_pos < 0:
-    # Don't waste time here if line doesn't contain 'CHECK' or 'EXPECT'
+  (check_macro, start_pos) = FindCheckMacro(lines[linenum])
+  if not check_macro:
     return
 
   # Find end of the boolean expression by matching parentheses
@@ -3201,6 +4147,13 @@
       clean_lines, linenum, start_pos)
   if end_pos < 0:
     return
+
+  # If the check macro is followed by something other than a
+  # semicolon, assume users will log their own custom error messages
+  # and don't suggest any replacements.
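+  # (e.g. 'CHECK(x);' is analyzed further, while 'CHECK(x) << "msg";' is
+  # left alone -- illustrative examples.)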
+  if not Match(r'\s*;', last_line[end_pos:]):
+    return
+
   if linenum == end_line:
     expression = lines[linenum][start_pos + 1:end_pos - 1]
   else:
@@ -3223,7 +4176,7 @@
       if token == '(':
         # Parenthesized operand
         expression = matched.group(2)
-        (end, _) = FindEndOfExpressionInLine(expression, 0, 1, '(', ')')
+        (end, _) = FindEndOfExpressionInLine(expression, 0, ['('])
         if end < 0:
           return  # Unmatched parenthesis
         lhs += '(' + expression[0:end]
@@ -3339,6 +4292,16 @@
       if unicodedata.east_asian_width(uc) in ('W', 'F'):
         width += 2
       elif not unicodedata.combining(uc):
+        # Issue 337
+        # https://mail.python.org/pipermail/python-list/2012-August/628809.html
+        if (sys.version_info.major, sys.version_info.minor) <= (3, 2):
+          # https://github.com/python/cpython/blob/2.7/Include/unicodeobject.h#L81
+          is_wide_build = sysconfig.get_config_var("Py_UNICODE_SIZE") >= 4
+          # https://github.com/python/cpython/blob/2.7/Objects/unicodeobject.c#L564
+          is_low_surrogate = 0xDC00 <= ord(uc) <= 0xDFFF
+          if not is_wide_build and is_low_surrogate:
+            width -= 1
+
         width += 1
     return width
   else:
@@ -3358,7 +4321,7 @@
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
     file_extension: The extension (without the dot) of the filename.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: The function to call with any errors found.
   """
@@ -3368,6 +4331,7 @@
   # raw strings,
   raw_lines = clean_lines.lines_without_raw_strings
   line = raw_lines[linenum]
+  prev = raw_lines[linenum - 1] if linenum > 0 else ''
 
   if line.find('\t') != -1:
     error(filename, linenum, 'whitespace/tab', 1,
@@ -3385,23 +4349,33 @@
   # if(match($0, " <<")) complain = 0;
   # if(match(prev, " +for \\(")) complain = 0;
   # if(prevodd && match(prevprev, " +for \\(")) complain = 0;
+  scope_or_label_pattern = r'\s*\w+\s*:\s*\\?$'
+  classinfo = nesting_state.InnermostClass()
   initial_spaces = 0
   cleansed_line = clean_lines.elided[linenum]
   while initial_spaces < len(line) and line[initial_spaces] == ' ':
     initial_spaces += 1
-  if line and line[-1].isspace():
-    error(filename, linenum, 'whitespace/end_of_line', 4,
-          'Line ends in whitespace.  Consider deleting these extra spaces.')
-  # There are certain situations we allow one space, notably for section labels
-  elif ((initial_spaces == 1 or initial_spaces == 3) and
-        not Match(r'\s*\w+\s*:\s*$', cleansed_line)):
+  # There are certain situations we allow one space, notably for
+  # section labels, and also lines containing multi-line raw strings.
+  # We also don't check for lines that look like continuation lines
+  # (of lines ending in double quotes, commas, equals, or angle brackets)
+  # because the rules for how to indent those are non-trivial.
+  if (not Search(r'[",=><] *$', prev) and
+      (initial_spaces == 1 or initial_spaces == 3) and
+      not Match(scope_or_label_pattern, cleansed_line) and
+      not (clean_lines.raw_lines[linenum] != line and
+           Match(r'^\s*""', line))):
     error(filename, linenum, 'whitespace/indent', 3,
           'Weird number of spaces at line-start.  '
           'Are you using a 2-space indent?')
 
+  if line and line[-1].isspace():
+    error(filename, linenum, 'whitespace/end_of_line', 4,
+          'Line ends in whitespace.  Consider deleting these extra spaces.')
+
   # Check if the line is a header guard.
   is_header_guard = False
-  if file_extension == 'h':
+  if IsHeaderExtension(file_extension):
     cppvar = GetHeaderGuardCPPVariable(filename)
     if (line.startswith('#ifndef %s' % cppvar) or
         line.startswith('#define %s' % cppvar) or
@@ -3417,14 +4391,10 @@
   # developer's fault.
   if (not line.startswith('#include') and not is_header_guard and
       not Match(r'^\s*//.*http(s?)://\S*$', line) and
+      not Match(r'^\s*//\s*[^\s]*$', line) and
       not Match(r'^// \$Id:.*#[0-9]+ \$$', line)):
     line_width = GetLineWidth(line)
-    extended_length = int((_line_length * 1.25))
-    if line_width > extended_length:
-      error(filename, linenum, 'whitespace/line_length', 4,
-            'Lines should very rarely be longer than %i characters' %
-            extended_length)
-    elif line_width > _line_length:
+    if line_width > _line_length:
       error(filename, linenum, 'whitespace/line_length', 2,
             'Lines should be <= %i characters long' % _line_length)
 
@@ -3442,9 +4412,14 @@
 
   # Some more style checks
   CheckBraces(filename, clean_lines, linenum, error)
+  CheckTrailingSemicolon(filename, clean_lines, linenum, error)
   CheckEmptyBlockBody(filename, clean_lines, linenum, error)
-  CheckAccess(filename, clean_lines, linenum, nesting_state, error)
   CheckSpacing(filename, clean_lines, linenum, nesting_state, error)
+  CheckOperatorSpacing(filename, clean_lines, linenum, error)
+  CheckParenthesisSpacing(filename, clean_lines, linenum, error)
+  CheckCommaSpacing(filename, clean_lines, linenum, error)
+  CheckBracesSpacing(filename, clean_lines, linenum, nesting_state, error)
+  CheckSpacingForFunctionCall(filename, clean_lines, linenum, error)
   CheckCheck(filename, clean_lines, linenum, error)
   CheckAltTokens(filename, clean_lines, linenum, error)
   classinfo = nesting_state.InnermostClass()
@@ -3452,7 +4427,6 @@
     CheckSectionSpacing(filename, clean_lines, classinfo, linenum, error)
 
 
-_RE_PATTERN_INCLUDE_NEW_STYLE = re.compile(r'#include +"[^/]+\.h"')
 _RE_PATTERN_INCLUDE = re.compile(r'^\s*#\s*include\s*([<"])([^>"]*)[>"].*$')
 # Matches the first component of a filename delimited by -s and _s. That is:
 #  _RE_FIRST_COMPONENT.match('foo').group(0) == 'foo'
@@ -3489,23 +4463,6 @@
   return os.path.splitext(filename)[0]
 
 
-def _IsTestFilename(filename):
-  """Determines if the given filename has a suffix that identifies it as a test.
-
-  Args:
-    filename: The input filename.
-
-  Returns:
-    True if 'filename' looks like a test, False otherwise.
-  """
-  if (filename.endswith('_test.cc') or
-      filename.endswith('_unittest.cc') or
-      filename.endswith('_regtest.cc')):
-    return True
-  else:
-    return False
-
-
 def _ClassifyInclude(fileinfo, include, is_system):
   """Figures out what kind of header 'include' is.
 
@@ -3581,11 +4538,17 @@
     error: The function to call with any errors found.
   """
   fileinfo = FileInfo(filename)
-
   line = clean_lines.lines[linenum]
 
   # "include" should use the new style "foo/bar.h" instead of just "bar.h"
-  if _RE_PATTERN_INCLUDE_NEW_STYLE.search(line):
+  # Only do this check if the included header follows google naming
+  # conventions.  If not, assume that it's a 3rd party API that
+  # requires special include conventions.
+  #
+  # We also make an exception for Lua headers, which follow google
+  # naming convention but not the include convention.
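+  #
+  # e.g. '#include "bar.h"' is flagged in favor of '#include "foo/bar.h"',
+  # unless "bar.h" matches _THIRD_PARTY_HEADERS_PATTERN.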
+  match = Match(r'#include\s*"([^/]+\.h)"', line)
+  if match and not _THIRD_PARTY_HEADERS_PATTERN.match(match.group(1)):
     error(filename, linenum, 'build/include', 4,
           'Include the directory when naming .h files')
 
@@ -3596,12 +4559,17 @@
   if match:
     include = match.group(2)
     is_system = (match.group(1) == '<')
-    if include in include_state:
+    duplicate_line = include_state.FindHeader(include)
+    if duplicate_line >= 0:
       error(filename, linenum, 'build/include', 4,
             '"%s" already included at %s:%s' %
-            (include, filename, include_state[include]))
-    else:
-      include_state[include] = linenum
+            (include, filename, duplicate_line))
+    elif (include.endswith('.cc') and
+          os.path.dirname(fileinfo.RepositoryName()) != os.path.dirname(include)):
+      error(filename, linenum, 'build/include', 4,
+            'Do not include .cc files from other packages')
+    elif not _THIRD_PARTY_HEADERS_PATTERN.match(include):
+      include_state.include_list[-1].append((include, linenum))
 
       # We want to ensure that headers appear in the right order:
       # 1) for foo.cc, foo.h  (preferred location)
@@ -3627,15 +4595,6 @@
               'Include "%s" not in alphabetical order' % include)
       include_state.SetLastHeader(canonical_include)
 
-  # Look for any of the stream classes that are part of standard C++.
-  match = _RE_PATTERN_INCLUDE.match(line)
-  if match:
-    include = match.group(2)
-    if Match(r'(f|ind|io|i|o|parse|pf|stdio|str|)?stream$', include):
-      # Many unit tests use cout, so we exempt them.
-      if not _IsTestFilename(filename):
-        error(filename, linenum, 'readability/streams', 3,
-              'Streams are highly discouraged.')
 
 
 def _GetTextInside(text, start_pattern):
@@ -3658,7 +4617,7 @@
     The extracted text.
     None if either the opening string or ending punctuation could not be found.
   """
-  # TODO(sugawarayu): Audit cpplint.py to see what places could be profitably
+  # TODO(unknown): Audit cpplint.py to see what places could be profitably
   # rewritten to use _GetTextInside (which currently use inferior regexp matching).
 
   # Give opening punctuations to get the matching close-punctuations.
@@ -3718,6 +4677,9 @@
 _RE_PATTERN_CONST_REF_PARAM = (
     r'(?:.*\s*\bconst\s*&\s*' + _RE_PATTERN_IDENT +
     r'|const\s+' + _RE_PATTERN_TYPE + r'\s*&\s*' + _RE_PATTERN_IDENT + r')')
+# Stream types.
+_RE_PATTERN_REF_STREAM_PARAM = (
+    r'(?:.*stream\s*&\s*' + _RE_PATTERN_IDENT + r')')
 
 
 def CheckLanguage(filename, clean_lines, linenum, file_extension,
@@ -3733,7 +4695,7 @@
     linenum: The number of the line to check.
     file_extension: The extension (without the dot) of the filename.
     include_state: An _IncludeState instance in which the headers are inserted.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: The function to call with any errors found.
   """
@@ -3750,129 +4712,23 @@
 
   # Reset include state across preprocessor directives.  This is meant
   # to silence warnings for conditional includes.
-  if Match(r'^\s*#\s*(?:ifdef|elif|else|endif)\b', line):
-    include_state.ResetSection()
+  match = Match(r'^\s*#\s*(if|ifdef|ifndef|elif|else|endif)\b', line)
+  if match:
+    include_state.ResetSection(match.group(1))
 
   # Make Windows paths like Unix.
   fullname = os.path.abspath(filename).replace('\\', '/')
 
-  # TODO(unknown): figure out if they're using default arguments in fn proto.
+  # Perform other checks now that we are sure that this is not an include line
+  CheckCasts(filename, clean_lines, linenum, error)
+  CheckGlobalStatic(filename, clean_lines, linenum, error)
+  CheckPrintf(filename, clean_lines, linenum, error)
 
-  # Check to see if they're using an conversion function cast.
-  # I just try to capture the most common basic types, though there are more.
-  # Parameterless conversion functions, such as bool(), are allowed as they are
-  # probably a member operator declaration or default constructor.
-  match = Search(
-      r'(\bnew\s+)?\b'  # Grab 'new' operator, if it's there
-      r'(int|float|double|bool|char|int32|uint32|int64|uint64)'
-      r'(\([^)].*)', line)
-  if match:
-    matched_new = match.group(1)
-    matched_type = match.group(2)
-    matched_funcptr = match.group(3)
-
-    # gMock methods are defined using some variant of MOCK_METHODx(name, type)
-    # where type may be float(), int(string), etc.  Without context they are
-    # virtually indistinguishable from int(x) casts. Likewise, gMock's
-    # MockCallback takes a template parameter of the form return_type(arg_type),
-    # which looks much like the cast we're trying to detect.
-    #
-    # std::function<> wrapper has a similar problem.
-    #
-    # Return types for function pointers also look like casts if they
-    # don't have an extra space.
-    if (matched_new is None and  # If new operator, then this isn't a cast
-        not (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or
-             Search(r'\bMockCallback<.*>', line) or
-             Search(r'\bstd::function<.*>', line)) and
-        not (matched_funcptr and
-             Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(',
-                   matched_funcptr))):
-      # Try a bit harder to catch gmock lines: the only place where
-      # something looks like an old-style cast is where we declare the
-      # return type of the mocked method, and the only time when we
-      # are missing context is if MOCK_METHOD was split across
-      # multiple lines.  The missing MOCK_METHOD is usually one or two
-      # lines back, so scan back one or two lines.
-      #
-      # It's not possible for gmock macros to appear in the first 2
-      # lines, since the class head + section name takes up 2 lines.
-      if (linenum < 2 or
-          not (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$',
-                     clean_lines.elided[linenum - 1]) or
-               Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$',
-                     clean_lines.elided[linenum - 2]))):
-        error(filename, linenum, 'readability/casting', 4,
-              'Using deprecated casting style.  '
-              'Use static_cast<%s>(...) instead' %
-              matched_type)
-
-  CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
-                  'static_cast',
-                  r'\((int|float|double|bool|char|u?int(16|32|64))\)', error)
-
-  # This doesn't catch all cases. Consider (const char * const)"hello".
-  #
-  # (char *) "foo" should always be a const_cast (reinterpret_cast won't
-  # compile).
-  if CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
-                     'const_cast', r'\((char\s?\*+\s?)\)\s*"', error):
-    pass
-  else:
-    # Check pointer casts for other than string constants
-    CheckCStyleCast(filename, linenum, line, clean_lines.raw_lines[linenum],
-                    'reinterpret_cast', r'\((\w+\s?\*+\s?)\)', error)
-
-  # In addition, we look for people taking the address of a cast.  This
-  # is dangerous -- casts can assign to temporaries, so the pointer doesn't
-  # point where you think.
-  match = Search(
-      r'(?:&\(([^)]+)\)[\w(])|'
-      r'(?:&(static|dynamic|down|reinterpret)_cast\b)', line)
-  if match and match.group(1) != '*':
-    error(filename, linenum, 'runtime/casting', 4,
-          ('Are you taking an address of a cast?  '
-           'This is dangerous: could be a temp var.  '
-           'Take the address before doing the cast, rather than after'))
-
-  # Create an extended_line, which is the concatenation of the current and
-  # next lines, for more effective checking of code that may span more than one
-  # line.
-  if linenum + 1 < clean_lines.NumLines():
-    extended_line = line + clean_lines.elided[linenum + 1]
-  else:
-    extended_line = line
-
-  # Check for people declaring static/global STL strings at the top level.
-  # This is dangerous because the C++ language does not guarantee that
-  # globals with constructors are initialized before the first access.
-  match = Match(
-      r'((?:|static +)(?:|const +))string +([a-zA-Z0-9_:]+)\b(.*)',
-      line)
-  # Make sure it's not a function.
-  # Function template specialization looks like: "string foo<Type>(...".
-  # Class template definitions look like: "string Foo<Type>::Method(...".
-  #
-  # Also ignore things that look like operators.  These are matched separately
-  # because operator names cross non-word boundaries.  If we change the pattern
-  # above, we would decrease the accuracy of matching identifiers.
-  if (match and
-      not Search(r'\boperator\W', line) and
-      not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)?\s*\(([^"]|$)', match.group(3))):
-    error(filename, linenum, 'runtime/string', 4,
-          'For a static/global string constant, use a C style string instead: '
-          '"%schar %s[]".' %
-          (match.group(1), match.group(2)))
-
-  if Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line):
-    error(filename, linenum, 'runtime/init', 4,
-          'You seem to be initializing a member variable with itself.')
-
-  if file_extension == 'h':
+  if IsHeaderExtension(file_extension):
     # TODO(unknown): check that 1-arg constructors are explicit.
     #                How to tell it's a constructor?
     #                (handled in CheckForNonStandardConstructs for now)
-    # TODO(unknown): check that classes have DISALLOW_EVIL_CONSTRUCTORS
+    # TODO(unknown): check that classes declare or disable copy/assign
     #                (level 1 error)
     pass
 
@@ -3888,23 +4744,6 @@
       error(filename, linenum, 'runtime/int', 4,
             'Use int16/int64/etc, rather than the C type %s' % match.group(1))
 
-  # When snprintf is used, the second argument shouldn't be a literal.
-  match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line)
-  if match and match.group(2) != '0':
-    # If 2nd arg is zero, snprintf is used to calculate size.
-    error(filename, linenum, 'runtime/printf', 3,
-          'If you can, use sizeof(%s) instead of %s as the 2nd arg '
-          'to snprintf.' % (match.group(1), match.group(2)))
-
-  # Check if some verboten C functions are being used.
-  if Search(r'\bsprintf\b', line):
-    error(filename, linenum, 'runtime/printf', 5,
-          'Never use sprintf.  Use snprintf instead.')
-  match = Search(r'\b(strcpy|strcat)\b', line)
-  if match:
-    error(filename, linenum, 'runtime/printf', 4,
-          'Almost always, snprintf is better than %s' % match.group(1))
-
   # Check if some verboten operator overloading is going on
   # TODO(unknown): catch out-of-line unary operator&:
   #   class X {};
@@ -3924,7 +4763,7 @@
   # Check for potential format string bugs like printf(foo).
   # We constrain the pattern not to pick things like DocidForPrintf(foo).
   # Not perfect but it can catch printf(foo.c_str()) and printf(foo->c_str())
-  # TODO(sugawarayu): Catch the following case. Need to change the calling
+  # TODO(unknown): Catch the following case. Need to change the calling
   # convention of the whole function to process multiple line to handle it.
   #   printf(
   #       boy_this_is_a_really_long_variable_that_cannot_fit_on_the_prev_line);
@@ -3989,37 +4828,188 @@
             'Do not use variable-length arrays.  Use an appropriately named '
             "('k' followed by CamelCase) compile-time constant for the size.")
 
-  # If DISALLOW_EVIL_CONSTRUCTORS, DISALLOW_COPY_AND_ASSIGN, or
-  # DISALLOW_IMPLICIT_CONSTRUCTORS is present, then it should be the last thing
-  # in the class declaration.
-  match = Match(
-      (r'\s*'
-       r'(DISALLOW_(EVIL_CONSTRUCTORS|COPY_AND_ASSIGN|IMPLICIT_CONSTRUCTORS))'
-       r'\(.*\);$'),
-      line)
-  if match and linenum + 1 < clean_lines.NumLines():
-    next_line = clean_lines.elided[linenum + 1]
-    # We allow some, but not all, declarations of variables to be present
-    # in the statement that defines the class.  The [\w\*,\s]* fragment of
-    # the regular expression below allows users to declare instances of
-    # the class or pointers to instances, but not less common types such
-    # as function pointers or arrays.  It's a tradeoff between allowing
-    # reasonable code and avoiding trying to parse more C++ using regexps.
-    if not Search(r'^\s*}[\w\*,\s]*;', next_line):
-      error(filename, linenum, 'readability/constructors', 3,
-            match.group(1) + ' should be the last thing in the class')
-
   # Check for use of unnamed namespaces in header files.  Registration
   # macros are typically OK, so we allow use of "namespace {" on lines
   # that end with backslashes.
-  if (file_extension == 'h'
+  if (IsHeaderExtension(file_extension)
       and Search(r'\bnamespace\s*{', line)
       and line[-1] != '\\'):
     error(filename, linenum, 'build/namespaces', 4,
           'Do not use unnamed namespaces in header files.  See '
-          'http://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces'
+          'https://google-styleguide.googlecode.com/svn/trunk/cppguide.xml#Namespaces'
           ' for more information.')
 
+
+def CheckGlobalStatic(filename, clean_lines, linenum, error):
+  """Check for unsafe global or static objects.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # Match two lines at a time to support multiline declarations
+  if linenum + 1 < clean_lines.NumLines() and not Search(r'[;({]', line):
+    line += clean_lines.elided[linenum + 1].strip()
+
+  # Check for people declaring static/global STL strings at the top level.
+  # This is dangerous because the C++ language does not guarantee that
+  # globals with constructors are initialized before the first access, and
+  # also because globals can be destroyed when some threads are still running.
+  # TODO(unknown): Generalize this to also find static unique_ptr instances.
+  # TODO(unknown): File bugs for clang-tidy to find these.
+  match = Match(
+      r'((?:|static +)(?:|const +))(?::*std::)?string( +const)? +'
+      r'([a-zA-Z0-9_:]+)\b(.*)',
+      line)
+
+  # Remove false positives:
+  # - String pointers (as opposed to values).
+  #    string *pointer
+  #    const string *pointer
+  #    string const *pointer
+  #    string *const pointer
+  #
+  # - Functions and template specializations.
+  #    string Function<Type>(...
+  #    string Class<Type>::Method(...
+  #
+  # - Operators.  These are matched separately because operator names
+  #   cross non-word boundaries, and trying to match both operators
+  #   and functions at the same time would decrease accuracy of
+  #   matching identifiers.
+  #    string Class::operator*()
+  if (match and
+      not Search(r'\bstring\b(\s+const)?\s*[\*\&]\s*(const\s+)?\w', line) and
+      not Search(r'\boperator\W', line) and
+      not Match(r'\s*(<.*>)?(::[a-zA-Z0-9_]+)*\s*\(([^"]|$)', match.group(4))):
+    if Search(r'\bconst\b', line):
+      error(filename, linenum, 'runtime/string', 4,
+            'For a static/global string constant, use a C style string '
+            'instead: "%schar%s %s[]".' %
+            (match.group(1), match.group(2) or '', match.group(3)))
+    else:
+      error(filename, linenum, 'runtime/string', 4,
+            'Static/global string variables are not permitted.')
+
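+  # Catch self-initialization in constructor initializer lists, e.g.
+  # "member_(member_)" or "member_(CHECK_NOTNULL(member_))".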
+  if (Search(r'\b([A-Za-z0-9_]*_)\(\1\)', line) or
+      Search(r'\b([A-Za-z0-9_]*_)\(CHECK_NOTNULL\(\1\)\)', line)):
+    error(filename, linenum, 'runtime/init', 4,
+          'You seem to be initializing a member variable with itself.')
+
+
+def CheckPrintf(filename, clean_lines, linenum, error):
+  """Check for printf related issues.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # When snprintf is used, the second argument shouldn't be a literal.
+  match = Search(r'snprintf\s*\(([^,]*),\s*([0-9]*)\s*,', line)
+  if match and match.group(2) != '0':
+    # If 2nd arg is zero, snprintf is used to calculate size.
+    error(filename, linenum, 'runtime/printf', 3,
+          'If you can, use sizeof(%s) instead of %s as the 2nd arg '
+          'to snprintf.' % (match.group(1), match.group(2)))
+
+  # Check if some verboten C functions are being used.
+  if Search(r'\bsprintf\s*\(', line):
+    error(filename, linenum, 'runtime/printf', 5,
+          'Never use sprintf. Use snprintf instead.')
+  match = Search(r'\b(strcpy|strcat)\s*\(', line)
+  if match:
+    error(filename, linenum, 'runtime/printf', 4,
+          'Almost always, snprintf is better than %s' % match.group(1))
+
+
+def IsDerivedFunction(clean_lines, linenum):
+  """Check if current line contains an inherited function.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+  Returns:
+    True if current line contains a function with "override"
+    virt-specifier.
+  """
+  # Scan back a few lines for start of current function
+  for i in xrange(linenum, max(-1, linenum - 10), -1):
+    match = Match(r'^([^()]*\w+)\(', clean_lines.elided[i])
+    if match:
+      # Look for "override" after the matching closing parenthesis
+      line, _, closing_paren = CloseExpression(
+          clean_lines, i, len(match.group(1)))
+      return (closing_paren >= 0 and
+              Search(r'\boverride\b', line[closing_paren:]))
+  return False
+
+
+def IsOutOfLineMethodDefinition(clean_lines, linenum):
+  """Check if current line contains an out-of-line method definition.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+  Returns:
+    True if current line contains an out-of-line method definition.
+  """
+  # Scan back a few lines for start of current function
+  for i in xrange(linenum, max(-1, linenum - 10), -1):
+    if Match(r'^([^()]*\w+)\(', clean_lines.elided[i]):
+      return Match(r'^[^()]*\w+::\w+\(', clean_lines.elided[i]) is not None
+  return False
+
+
+def IsInitializerList(clean_lines, linenum):
+  """Check if current line is inside constructor initializer list.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+  Returns:
+    True if current line appears to be inside constructor initializer
+    list, False otherwise.
+  """
+  for i in xrange(linenum, 1, -1):
+    line = clean_lines.elided[i]
+    if i == linenum:
+      remove_function_body = Match(r'^(.*)\{\s*$', line)
+      if remove_function_body:
+        line = remove_function_body.group(1)
+
+    if Search(r'\s:\s*\w+[({]', line):
+      # A lone colon tends to indicate the start of a constructor
+      # initializer list.  It could also be a ternary operator, which
+      # also tends to appear in constructor initializer lists as
+      # opposed to parameter lists.
+      return True
+    if Search(r'\}\s*,\s*$', line):
+      # A closing brace followed by a comma is probably the end of a
+      # brace-initialized member in constructor initializer list.
+      return True
+    if Search(r'[{};]\s*$', line):
+      # Found one of the following:
+      # - A closing brace or semicolon, probably the end of the previous
+      #   function.
+      # - An opening brace, probably the start of current class or namespace.
+      #
+      # Current line is probably not inside an initializer list since
+      # we saw one of those things without seeing the starting colon.
+      return False
+
+  # Got to the beginning of the file without seeing the start of
+  # constructor initializer list.
+  return False
+
+
 def CheckForNonConstReference(filename, clean_lines, linenum,
                               nesting_state, error):
   """Check for non-const references.
@@ -4031,7 +5021,7 @@
     filename: The name of the current file.
     clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: The function to call with any errors found.
   """
@@ -4040,6 +5030,17 @@
   if '&' not in line:
     return
 
+  # If a function is inherited, the current function doesn't have much
+  # of a choice, so any non-const references should not be blamed on
+  # the derived function.
+  if IsDerivedFunction(clean_lines, linenum):
+    return
+
+  # Don't warn on out-of-line method definitions, as we would warn on the
+  # in-line declaration, if it isn't marked with 'override'.
+  if IsOutOfLineMethodDefinition(clean_lines, linenum):
+    return
+
   # Long type names may be broken across multiple lines, usually in one
   # of these forms:
   #   LongType
@@ -4088,60 +5089,192 @@
   #   inside declarators: reference parameter
   # We will exclude the first two cases by checking that we are not inside a
   # function body, including one that was just introduced by a trailing '{'.
-  # TODO(unknwon): Doesn't account for preprocessor directives.
   # TODO(unknown): Doesn't account for 'catch(Exception& e)' [rare].
-  check_params = False
-  if not nesting_state.stack:
-    check_params = True  # top level
-  elif (isinstance(nesting_state.stack[-1], _ClassInfo) or
-        isinstance(nesting_state.stack[-1], _NamespaceInfo)):
-    check_params = True  # within class or namespace
-  elif Match(r'.*{\s*$', line):
-    if (len(nesting_state.stack) == 1 or
-        isinstance(nesting_state.stack[-2], _ClassInfo) or
-        isinstance(nesting_state.stack[-2], _NamespaceInfo)):
-      check_params = True  # just opened global/class/namespace block
+  if (nesting_state.previous_stack_top and
+      not (isinstance(nesting_state.previous_stack_top, _ClassInfo) or
+           isinstance(nesting_state.previous_stack_top, _NamespaceInfo))):
+    # Not at toplevel, not within a class, and not within a namespace
+    return
+
+  # Avoid initializer lists.  We only need to scan back from the
+  # current line for something that starts with ':'.
+  #
+  # We don't need to check the current line, since the '&' would
+  # appear inside the second set of parentheses on the current line as
+  # opposed to the first set.
+  if linenum > 0:
+    for i in xrange(linenum - 1, max(0, linenum - 10), -1):
+      previous_line = clean_lines.elided[i]
+      if not Search(r'[),]\s*$', previous_line):
+        break
+      if Match(r'^\s*:\s+\S', previous_line):
+        return
+
+  # Avoid preprocessors
+  if Search(r'\\\s*$', line):
+    return
+
+  # Avoid constructor initializer lists
+  if IsInitializerList(clean_lines, linenum):
+    return
+
   # We allow non-const references in a few standard places, like functions
   # called "swap()" or iostream operators like "<<" or ">>".  Do not check
   # those function parameters.
   #
   # We also accept & in static_assert, which looks like a function but
   # it's actually a declaration expression.
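+  #
+  # For example (hypothetical input), "void swap(Foo& a, Foo& b)" is not
+  # flagged, since swap() conventionally takes non-const references.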
-  whitelisted_functions = (r'(?:[sS]wap(?:<\w:+>)?|'
+  allowed_functions = (r'(?:[sS]wap(?:<\w:+>)?|'
-                           r'operator\s*[<>][<>]|'
-                           r'static_assert|COMPILE_ASSERT'
-                           r')\s*\(')
+                       r'operator\s*[<>][<>]|'
+                       r'static_assert|COMPILE_ASSERT'
+                       r')\s*\(')
-  if Search(whitelisted_functions, line):
-    check_params = False
+  if Search(allowed_functions, line):
+    return
   elif not Search(r'\S+\([^)]*$', line):
-    # Don't see a whitelisted function on this line.  Actually we
+    # Don't see an allowed function on this line.  Actually we
     # didn't see any function name on this line, so this is likely a
     # multi-line parameter list.  Try a bit harder to catch this case.
     for i in xrange(2):
       if (linenum > i and
-          Search(whitelisted_functions, clean_lines.elided[linenum - i - 1])):
-        check_params = False
-        break
+          Search(allowed_functions, clean_lines.elided[linenum - i - 1])):
+        return
 
-  if check_params:
-    decls = ReplaceAll(r'{[^}]*}', ' ', line)  # exclude function body
-    for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls):
-      if not Match(_RE_PATTERN_CONST_REF_PARAM, parameter):
-        error(filename, linenum, 'runtime/references', 2,
-              'Is this a non-const reference? '
-              'If so, make const or use a pointer: ' +
-              ReplaceAll(' *<', '<', parameter))
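+  # Anything that reaches this point is checked; e.g. (hypothetical input)
+  # "void Update(Config& config);" draws a runtime/references warning.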
+  decls = ReplaceAll(r'{[^}]*}', ' ', line)  # exclude function body
+  for parameter in re.findall(_RE_PATTERN_REF_PARAM, decls):
+    if (not Match(_RE_PATTERN_CONST_REF_PARAM, parameter) and
+        not Match(_RE_PATTERN_REF_STREAM_PARAM, parameter)):
+      error(filename, linenum, 'runtime/references', 2,
+            'Is this a non-const reference? '
+            'If so, make const or use a pointer: ' +
+            ReplaceAll(' *<', '<', parameter))
 
 
-def CheckCStyleCast(filename, linenum, line, raw_line, cast_type, pattern,
-                    error):
+def CheckCasts(filename, clean_lines, linenum, error):
+  """Various cast related checks.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  # Check to see if they're using a conversion function cast.
+  # I just try to capture the most common basic types, though there are more.
+  # Parameterless conversion functions, such as bool(), are allowed as they are
+  # probably a member operator declaration or default constructor.
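+  # For example (hypothetical input), "int(value)" is flagged below, while
+  # "new int(5)" and the parameterless "operator bool()" are not.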
+  match = Search(
+      r'(\bnew\s+(?:const\s+)?|\S<\s*(?:const\s+)?)?\b'
+      r'(int|float|double|bool|char|int32|uint32|int64|uint64)'
+      r'(\([^)].*)', line)
+  expecting_function = ExpectingFunctionArgs(clean_lines, linenum)
+  if match and not expecting_function:
+    matched_type = match.group(2)
+
+    # matched_new_or_template is used to silence two false positives:
+    # - New operators
+    # - Template arguments with function types
+    #
+    # For template arguments, we match on types immediately following
+    # an opening bracket without any spaces.  This is a fast way to
+    # silence the common case where the function type is the first
+    # template argument.  False negative with less-than comparison is
+    # avoided because those operators are usually followed by a space.
+    #
+    #   function<double(double)>   // bracket + no space = false positive
+    #   value < double(42)         // bracket + space = true positive
+    matched_new_or_template = match.group(1)
+
+    # Avoid arrays by looking for brackets that come after the closing
+    # parenthesis.
+    if Match(r'\([^()]+\)\s*\[', match.group(3)):
+      return
+
+    # Other things to ignore:
+    # - Function pointers
+    # - Casts to pointer types
+    # - Placement new
+    # - Alias declarations
+    matched_funcptr = match.group(3)
+    if (matched_new_or_template is None and
+        not (matched_funcptr and
+             (Match(r'\((?:[^() ]+::\s*\*\s*)?[^() ]+\)\s*\(',
+                    matched_funcptr) or
+              matched_funcptr.startswith('(*)'))) and
+        not Match(r'\s*using\s+\S+\s*=\s*' + matched_type, line) and
+        not Search(r'new\(\S+\)\s*' + matched_type, line)):
+      error(filename, linenum, 'readability/casting', 4,
+            'Using deprecated casting style.  '
+            'Use static_cast<%s>(...) instead' %
+            matched_type)
+
+  if not expecting_function:
+    CheckCStyleCast(filename, clean_lines, linenum, 'static_cast',
+                    r'\((int|float|double|bool|char|u?int(16|32|64))\)', error)
+
+  # This doesn't catch all cases. Consider (const char * const)"hello".
+  #
+  # (char *) "foo" should always be a const_cast (reinterpret_cast won't
+  # compile).
+  if CheckCStyleCast(filename, clean_lines, linenum, 'const_cast',
+                     r'\((char\s?\*+\s?)\)\s*"', error):
+    pass
+  else:
+    # Check pointer casts for other than string constants
+    CheckCStyleCast(filename, clean_lines, linenum, 'reinterpret_cast',
+                    r'\((\w+\s?\*+\s?)\)', error)
+
+  # In addition, we look for people taking the address of a cast.  This
+  # is dangerous -- casts can assign to temporaries, so the pointer doesn't
+  # point where you think.
+  #
+  # Some non-identifier character is required before the '&' for the
+  # expression to be recognized as a cast.  These are casts:
+  #   expression = &static_cast<int*>(temporary());
+  #   function(&(int*)(temporary()));
+  #
+  # This is not a cast:
+  #   reference_type&(int* function_param);
+  match = Search(
+      r'(?:[^\w]&\(([^)*][^)]*)\)[\w(])|'
+      r'(?:[^\w]&(static|dynamic|down|reinterpret)_cast\b)', line)
+  if match:
+    # Try a better error message when the & is bound to something
+    # dereferenced by the casted pointer, as opposed to the casted
+    # pointer itself.
+    parenthesis_error = False
+    match = Match(r'^(.*&(?:static|dynamic|down|reinterpret)_cast\b)<', line)
+    if match:
+      _, y1, x1 = CloseExpression(clean_lines, linenum, len(match.group(1)))
+      if x1 >= 0 and clean_lines.elided[y1][x1] == '(':
+        _, y2, x2 = CloseExpression(clean_lines, y1, x1)
+        if x2 >= 0:
+          extended_line = clean_lines.elided[y2][x2:]
+          if y2 < clean_lines.NumLines() - 1:
+            extended_line += clean_lines.elided[y2 + 1]
+          if Match(r'\s*(?:->|\[)', extended_line):
+            parenthesis_error = True
+
+    if parenthesis_error:
+      error(filename, linenum, 'readability/casting', 4,
+            ('Are you taking an address of something dereferenced '
+             'from a cast?  Wrapping the dereferenced expression in '
+             'parentheses will make the binding more obvious'))
+    else:
+      error(filename, linenum, 'runtime/casting', 4,
+            ('Are you taking an address of a cast?  '
+             'This is dangerous: could be a temp var.  '
+             'Take the address before doing the cast, rather than after'))
+
+
+def CheckCStyleCast(filename, clean_lines, linenum, cast_type, pattern, error):
   """Checks for a C-style cast by looking for the pattern.
 
   Args:
     filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
     linenum: The number of the line to check.
-    line: The line of code to check.
-    raw_line: The raw line of code to check, with comments.
     cast_type: The string for the C++ cast to recommend.  This is either
       reinterpret_cast, static_cast, or const_cast, depending.
     pattern: The regular expression used to find C-style casts.
@@ -4151,75 +5284,34 @@
     True if an error was emitted.
     False otherwise.
   """
+  line = clean_lines.elided[linenum]
   match = Search(pattern, line)
   if not match:
     return False
 
-  # e.g., sizeof(int)
-  sizeof_match = Match(r'.*sizeof\s*$', line[0:match.start(1) - 1])
-  if sizeof_match:
-    error(filename, linenum, 'runtime/sizeof', 1,
-          'Using sizeof(type).  Use sizeof(varname) instead if possible')
-    return True
-
-  # operator++(int) and operator--(int)
-  if (line[0:match.start(1) - 1].endswith(' operator++') or
-      line[0:match.start(1) - 1].endswith(' operator--')):
+  # Exclude lines with keywords that tend to look like casts
+  context = line[0:match.start(1) - 1]
+  if Match(r'.*\b(?:sizeof|alignof|alignas|[_A-Z][_A-Z0-9]*)\s*$', context):
     return False
 
-  # A single unnamed argument for a function tends to look like old
-  # style cast.  If we see those, don't issue warnings for deprecated
-  # casts, instead issue warnings for unnamed arguments where
-  # appropriate.
-  #
-  # These are things that we want warnings for, since the style guide
-  # explicitly require all parameters to be named:
-  #   Function(int);
-  #   Function(int) {
-  #   ConstMember(int) const;
-  #   ConstMember(int) const {
-  #   ExceptionMember(int) throw (...);
-  #   ExceptionMember(int) throw (...) {
-  #   PureVirtual(int) = 0;
-  #
-  # These are functions of some sort, where the compiler would be fine
-  # if they had named parameters, but people often omit those
-  # identifiers to reduce clutter:
-  #   (FunctionPointer)(int);
-  #   (FunctionPointer)(int) = value;
-  #   Function((function_pointer_arg)(int))
-  #   <TemplateArgument(int)>;
-  #   <(FunctionPointerTemplateArgument)(int)>;
+  # Try expanding the current context to see if we are one level of
+  # parentheses deep inside a macro.
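+  # For example (hypothetical input), in
+  #   CHECK_SOMETHING(foo,
+  #                   (int)bar);
+  # the "(int)" sits inside an uppercase macro invocation and is skipped.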
+  if linenum > 0:
+    for i in xrange(linenum - 1, max(0, linenum - 5), -1):
+      context = clean_lines.elided[i] + context
+  if Match(r'.*\b[_A-Z][_A-Z0-9]*\s*\((?:\([^()]*\)|[^()])*$', context):
+    return False
+
+  # operator++(int) and operator--(int)
+  if context.endswith(' operator++') or context.endswith(' operator--'):
+    return False
+
+  # A single unnamed argument for a function tends to look like old style cast.
+  # If we see those, don't issue warnings for deprecated casts.
   remainder = line[match.end(0):]
-  if Match(r'^\s*(?:;|const\b|throw\b|=|>|\{|\))', remainder):
-    # Looks like an unnamed parameter.
-
-    # Don't warn on any kind of template arguments.
-    if Match(r'^\s*>', remainder):
-      return False
-
-    # Don't warn on assignments to function pointers, but keep warnings for
-    # unnamed parameters to pure virtual functions.  Note that this pattern
-    # will also pass on assignments of "0" to function pointers, but the
-    # preferred values for those would be "nullptr" or "NULL".
-    matched_zero = Match(r'^\s=\s*(\S+)\s*;', remainder)
-    if matched_zero and matched_zero.group(1) != '0':
-      return False
-
-    # Don't warn on function pointer declarations.  For this we need
-    # to check what came before the "(type)" string.
-    if Match(r'.*\)\s*$', line[0:match.start(0)]):
-      return False
-
-    # Don't warn if the parameter is named with block comments, e.g.:
-    #  Function(int /*unused_param*/);
-    if '/*' in raw_line:
-      return False
-
-    # Passed all filters, issue warning here.
-    error(filename, linenum, 'readability/function', 3,
-          'All parameters should be named in a function')
-    return True
+  if Match(r'^\s*(?:;|const\b|throw\b|final\b|override\b|[=>{),]|->)',
+           remainder):
+    return False
 
   # At this point, all that should be left is actual casts.
   error(filename, linenum, 'readability/casting', 4,
@@ -4229,6 +5321,28 @@
   return True
 
 
+def ExpectingFunctionArgs(clean_lines, linenum):
+  """Checks whether where function type arguments are expected.
+
+  Args:
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+
+  Returns:
+    True if the line at 'linenum' is inside something that expects arguments
+    of function types.
+  """
+  line = clean_lines.elided[linenum]
+  return (Match(r'^\s*MOCK_(CONST_)?METHOD\d+(_T)?\(', line) or
+          (linenum >= 2 and
+           (Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\((?:\S+,)?\s*$',
+                  clean_lines.elided[linenum - 1]) or
+            Match(r'^\s*MOCK_(?:CONST_)?METHOD\d+(?:_T)?\(\s*$',
+                  clean_lines.elided[linenum - 2]) or
+            Search(r'\bstd::m?function\s*\<\s*$',
+                   clean_lines.elided[linenum - 1]))))
+
+
 _HEADERS_CONTAINING_TEMPLATES = (
     ('<deque>', ('deque',)),
     ('<functional>', ('unary_function', 'binary_function',
@@ -4251,11 +5365,15 @@
     ('<limits>', ('numeric_limits',)),
     ('<list>', ('list',)),
     ('<map>', ('map', 'multimap',)),
-    ('<memory>', ('allocator',)),
+    ('<memory>', ('allocator', 'make_shared', 'make_unique', 'shared_ptr',
+                  'unique_ptr', 'weak_ptr')),
     ('<queue>', ('queue', 'priority_queue',)),
     ('<set>', ('set', 'multiset',)),
     ('<stack>', ('stack',)),
     ('<string>', ('char_traits', 'basic_string',)),
+    ('<tuple>', ('tuple',)),
+    ('<unordered_map>', ('unordered_map', 'unordered_multimap')),
+    ('<unordered_set>', ('unordered_set', 'unordered_multiset')),
     ('<utility>', ('pair',)),
     ('<vector>', ('vector',)),
 
@@ -4266,18 +5384,26 @@
     ('<slist>', ('slist',)),
     )
 
+_HEADERS_MAYBE_TEMPLATES = (
+    ('<algorithm>', ('copy', 'max', 'min', 'min_element', 'sort',
+                     'transform',
+                    )),
+    ('<utility>', ('forward', 'make_pair', 'move', 'swap')),
+    )
+
 _RE_PATTERN_STRING = re.compile(r'\bstring\b')
 
-_re_pattern_algorithm_header = []
-for _template in ('copy', 'max', 'min', 'min_element', 'sort', 'swap',
-                  'transform'):
-  # Match max<type>(..., ...), max(..., ...), but not foo->max, foo.max or
-  # type::max().
-  _re_pattern_algorithm_header.append(
-      (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'),
-       _template,
-       '<algorithm>'))
+_re_pattern_headers_maybe_templates = []
+for _header, _templates in _HEADERS_MAYBE_TEMPLATES:
+  for _template in _templates:
+    # Match max<type>(..., ...), max(..., ...), but not foo->max, foo.max or
+    # type::max().
+    _re_pattern_headers_maybe_templates.append(
+        (re.compile(r'[^>.]\b' + _template + r'(<.*?>)?\([^\)]'),
+         _template,
+         _header))
 
+# Other scripts may reach in and modify this pattern.
 _re_pattern_templates = []
 for _header, _templates in _HEADERS_CONTAINING_TEMPLATES:
   for _template in _templates:
@@ -4317,13 +5443,13 @@
     string: the additional prefix needed to open the header file.
   """
 
-  if not filename_cc.endswith('.cc'):
+  fileinfo = FileInfo(filename_cc)
+  if not fileinfo.IsSource():
     return (False, '')
-  filename_cc = filename_cc[:-len('.cc')]
-  if filename_cc.endswith('_unittest'):
-    filename_cc = filename_cc[:-len('_unittest')]
-  elif filename_cc.endswith('_test'):
-    filename_cc = filename_cc[:-len('_test')]
+  filename_cc = filename_cc[:-len(fileinfo.Extension())]
+  matched_test_suffix = Search(_TEST_FILE_SUFFIX, fileinfo.BaseName())
+  if matched_test_suffix:
+    filename_cc = filename_cc[:-len(matched_test_suffix.group(1))]
   filename_cc = filename_cc.replace('/public/', '/')
   filename_cc = filename_cc.replace('/internal/', '/')
 
@@ -4342,16 +5468,16 @@
   return files_belong_to_same_module, common_path
 
 
-def UpdateIncludeState(filename, include_state, io=codecs):
-  """Fill up the include_state with new includes found from the file.
+def UpdateIncludeState(filename, include_dict, io=codecs):
+  """Fill up the include_dict with new includes found from the file.
 
   Args:
     filename: the name of the header to read.
-    include_state: an _IncludeState instance in which the headers are inserted.
+    include_dict: a dictionary in which the headers are inserted.
     io: The io factory to use to read the file. Provided for testability.
 
   Returns:
-    True if a header was succesfully added. False otherwise.
+    True if a header was successfully added. False otherwise.
   """
   headerfile = None
   try:
@@ -4365,9 +5491,7 @@
     match = _RE_PATTERN_INCLUDE.search(clean_line)
     if match:
       include = match.group(2)
-      # The value formatting is cute, but not really used right now.
-      # What matters here is that the key is in include_state.
-      include_state.setdefault(include, '%s:%d' % (filename, linenum))
+      include_dict.setdefault(include, linenum)
   return True
 
 
@@ -4406,7 +5530,7 @@
       if prefix.endswith('std::') or not prefix.endswith('::'):
         required['<string>'] = (linenum, 'string')
 
-    for pattern, template, header in _re_pattern_algorithm_header:
+    for pattern, template, header in _re_pattern_headers_maybe_templates:
       if pattern.search(line):
         required[header] = (linenum, template)
 
@@ -4415,15 +5539,21 @@
       continue
 
     for pattern, template, header in _re_pattern_templates:
-      if pattern.search(line):
-        required[header] = (linenum, template)
+      matched = pattern.search(line)
+      if matched:
+        # Don't warn about IWYU in non-STL namespaces:
+        # (We check only the first match per line; good enough.)
+        prefix = line[:matched.start()]
+        if prefix.endswith('std::') or not prefix.endswith('::'):
+          required[header] = (linenum, template)
 
   # The policy is that if you #include something in foo.h you don't need to
   # include it again in foo.cc. Here, we will look at possible includes.
-  # Let's copy the include_state so it is only messed up within this function.
-  include_state = include_state.copy()
+  # Let's flatten the include_state include_list and copy it into a dictionary.
+  include_dict = dict([item for sublist in include_state.include_list
+                       for item in sublist])
 
-  # Did we find the header for this file (if any) and succesfully load it?
+  # Did we find the header for this file (if any) and successfully load it?
   header_found = False
 
   # Use the absolute path so that matching works properly.
@@ -4438,13 +5568,13 @@
   # instead of 'foo_flymake.h'
   abs_filename = re.sub(r'_flymake\.cc$', '.cc', abs_filename)
 
-  # include_state is modified during iteration, so we iterate over a copy of
+  # include_dict is modified during iteration, so we iterate over a copy of
   # the keys.
-  header_keys = include_state.keys()
+  header_keys = include_dict.keys()
   for header in header_keys:
     (same_module, common_path) = FilesBelongToSameModule(abs_filename, header)
     fullpath = common_path + header
-    if same_module and UpdateIncludeState(fullpath, include_state, io):
+    if same_module and UpdateIncludeState(fullpath, include_dict, io):
       header_found = True
 
   # If we can't find the header file for a .cc, assume it's because we don't
@@ -4458,7 +5588,7 @@
   # All the lines have been processed, report the errors found.
   for required_header_unstripped in required:
     template = required[required_header_unstripped][1]
-    if required_header_unstripped.strip('<>"') not in include_state:
+    if required_header_unstripped.strip('<>"') not in include_dict:
       error(filename, required[required_header_unstripped][0],
             'build/include_what_you_use', 4,
             'Add #include ' + required_header_unstripped + ' for ' + template)
@@ -4470,7 +5600,7 @@
 def CheckMakePairUsesDeduction(filename, clean_lines, linenum, error):
   """Check that make_pair's template arguments are deduced.
 
-  G++ 4.6 in C++0x mode fails badly if make_pair's template arguments are
+  G++ 4.6 in C++11 mode fails badly if make_pair's template arguments are
   specified explicitly, and such use isn't intended in any case.
 
   Args:
@@ -4488,6 +5618,165 @@
           ' OR use pair directly OR if appropriate, construct a pair directly')
 
 
+def CheckRedundantVirtual(filename, clean_lines, linenum, error):
+  """Check if line contains a redundant "virtual" function-specifier.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  # Look for "virtual" on current line.
+  line = clean_lines.elided[linenum]
+  virtual = Match(r'^(.*)(\bvirtual\b)(.*)$', line)
+  if not virtual: return
+
+  # Ignore "virtual" keywords that are near access-specifiers.  These
+  # are only used in class base-specifiers and do not apply to member
+  # functions.
+  if (Search(r'\b(public|protected|private)\s+$', virtual.group(1)) or
+      Match(r'^\s+(public|protected|private)\b', virtual.group(3))):
+    return
+
+  # Ignore the "virtual" keyword from virtual base classes.  Usually
+  # there is a colon on the same line in these cases (virtual base
+  # classes are rare in google3 because multiple inheritance is rare).
+  if Match(r'^.*[^:]:[^:].*$', line): return
+
+  # Look for the next opening parenthesis.  This is the start of the
+  # parameter list (possibly on the next line shortly after virtual).
+  # TODO(unknown): doesn't work if there are virtual functions with
+  # decltype() or other things that use parentheses, but csearch suggests
+  # that this is rare.
+  end_col = -1
+  end_line = -1
+  start_col = len(virtual.group(2))
+  for start_line in xrange(linenum, min(linenum + 3, clean_lines.NumLines())):
+    line = clean_lines.elided[start_line][start_col:]
+    parameter_list = Match(r'^([^(]*)\(', line)
+    if parameter_list:
+      # Match parentheses to find the end of the parameter list
+      (_, end_line, end_col) = CloseExpression(
+          clean_lines, start_line, start_col + len(parameter_list.group(1)))
+      break
+    start_col = 0
+
+  if end_col < 0:
+    return  # Couldn't find end of parameter list, give up
+
+  # Look for "override" or "final" after the parameter list
+  # (possibly on the next few lines).
+  for i in xrange(end_line, min(end_line + 3, clean_lines.NumLines())):
+    line = clean_lines.elided[i][end_col:]
+    match = Search(r'\b(override|final)\b', line)
+    if match:
+      error(filename, linenum, 'readability/inheritance', 4,
+            ('"virtual" is redundant since function is '
+             'already declared as "%s"' % match.group(1)))
+
+    # Set end_col to check whole lines after we are done with the
+    # first line.
+    end_col = 0
+    if Search(r'[^\w]\s*$', line):
+      break
+
+
+def CheckRedundantOverrideOrFinal(filename, clean_lines, linenum, error):
+  """Check if line contains a redundant "override" or "final" virt-specifier.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  # Look for closing parenthesis nearby.  We need one to confirm where
+  # the declarator ends and where the virt-specifier starts to avoid
+  # false positives.
+  line = clean_lines.elided[linenum]
+  declarator_end = line.rfind(')')
+  if declarator_end >= 0:
+    fragment = line[declarator_end:]
+  else:
+    if linenum > 1 and clean_lines.elided[linenum - 1].rfind(')') >= 0:
+      fragment = line
+    else:
+      return
+
+  # Check that at most one of "override" or "final" is present, not both
+  if Search(r'\boverride\b', fragment) and Search(r'\bfinal\b', fragment):
+    error(filename, linenum, 'readability/inheritance', 4,
+          ('"override" is redundant since function is '
+           'already declared as "final"'))
+
+
+# Returns true if we are at a new block, and it is directly
+# inside of a namespace.
+def IsBlockInNameSpace(nesting_state, is_forward_declaration):
+  """Checks that the new block is directly in a namespace.
+
+  Args:
+    nesting_state: The _NestingState object that contains info about our state.
+    is_forward_declaration: If the class is a forward declared class.
+  Returns:
+    Whether or not the new block is directly in a namespace.
+  """
+  if is_forward_declaration:
+    return (len(nesting_state.stack) >= 1 and
+            isinstance(nesting_state.stack[-1], _NamespaceInfo))
+
+  return (len(nesting_state.stack) > 1 and
+          nesting_state.stack[-1].check_namespace_indentation and
+          isinstance(nesting_state.stack[-2], _NamespaceInfo))
+
+
+def ShouldCheckNamespaceIndentation(nesting_state, is_namespace_indent_item,
+                                    raw_lines_no_comments, linenum):
+  """This method determines if we should apply our namespace indentation check.
+
+  Args:
+    nesting_state: The current nesting state.
+    is_namespace_indent_item: If we just put a new class on the stack, True.
+      If the top of the stack is not a class, or we did not recently
+      add the class, False.
+    raw_lines_no_comments: The lines without the comments.
+    linenum: The current line number we are processing.
+
+  Returns:
+    True if we should apply our namespace indentation check. Currently, it
+    only works for classes and namespaces inside of a namespace.
+  """
+
+  is_forward_declaration = IsForwardClassDeclaration(raw_lines_no_comments,
+                                                     linenum)
+
+  if not (is_namespace_indent_item or is_forward_declaration):
+    return False
+
+  # If we are in a macro, we do not want to check the namespace indentation.
+  if IsMacroDefinition(raw_lines_no_comments, linenum):
+    return False
+
+  return IsBlockInNameSpace(nesting_state, is_forward_declaration)
+
+
+# Call this method if the line is directly inside of a namespace.
+# If the line above is blank (excluding comments) or the start of
+# an inner namespace, it cannot be indented.
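+#
+# For example (hypothetical input), the indented declaration here is flagged:
+#   namespace mynamespace {
+#     class MyClass;  // runtime/indentation_namespace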
+def CheckItemIndentationInNamespace(filename, raw_lines_no_comments, linenum,
+                                    error):
+  line = raw_lines_no_comments[linenum]
+  if Match(r'^\s+', line):
+    error(filename, linenum, 'runtime/indentation_namespace', 4,
+          'Do not indent within a namespace')
+
+
 def ProcessLine(filename, file_extension, clean_lines, line,
                 include_state, function_state, nesting_state, error,
                 extra_check_functions=[]):
@@ -4501,7 +5790,7 @@
     line: Number of line being processed.
     include_state: An _IncludeState instance in which the headers are inserted.
     function_state: A _FunctionState instance which counts function lines, etc.
-    nesting_state: A _NestingState instance which maintains information about
+    nesting_state: A NestingState instance which maintains information about
                    the current stack of nested blocks being parsed.
     error: A callable to which errors are reported, which takes 4 arguments:
            filename, line number, error level, and message
@@ -4512,8 +5801,9 @@
   raw_lines = clean_lines.raw_lines
   ParseNolintSuppressions(filename, raw_lines[line], line, error)
   nesting_state.Update(filename, clean_lines, line, error)
-  if nesting_state.stack and nesting_state.stack[-1].inline_asm != _NO_ASM:
-    return
+  CheckForNamespaceIndentation(filename, nesting_state, clean_lines, line,
+                               error)
+  if nesting_state.InAsmBlock(): return
   CheckForFunctionLengths(filename, clean_lines, line, function_state, error)
   CheckForMultilineCommentsAndStrings(filename, clean_lines, line, error)
   CheckStyle(filename, clean_lines, line, file_extension, nesting_state, error)
@@ -4526,9 +5816,82 @@
   CheckPosixThreading(filename, clean_lines, line, error)
   CheckInvalidIncrement(filename, clean_lines, line, error)
   CheckMakePairUsesDeduction(filename, clean_lines, line, error)
+  CheckRedundantVirtual(filename, clean_lines, line, error)
+  CheckRedundantOverrideOrFinal(filename, clean_lines, line, error)
   for check_fn in extra_check_functions:
     check_fn(filename, clean_lines, line, error)
 
+def FlagCxx11Features(filename, clean_lines, linenum, error):
+  """Flag those c++11 features that we only allow in certain places.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line)
+
+  # Flag unapproved C++ TR1 headers.
+  if include and include.group(1).startswith('tr1/'):
+    error(filename, linenum, 'build/c++tr1', 5,
+          ('C++ TR1 headers such as <%s> are unapproved.') % include.group(1))
+
+  # Flag unapproved C++11 headers.
+  if include and include.group(1) in ('cfenv',
+                                      'condition_variable',
+                                      'fenv.h',
+                                      'future',
+                                      'mutex',
+                                      'thread',
+                                      'chrono',
+                                      'ratio',
+                                      'regex',
+                                      'system_error',
+                                     ):
+    error(filename, linenum, 'build/c++11', 5,
+          ('<%s> is an unapproved C++11 header.') % include.group(1))
+
+  # The only place where we need to worry about C++11 keywords and library
+  # features in preprocessor directives is in macro definitions.
+  if Match(r'\s*#', line) and not Match(r'\s*#\s*define\b', line): return
+
+  # These are classes and free functions.  The classes are always
+  # mentioned as std::*, but we only catch the free functions if
+  # they're not found by ADL.  They're alphabetical by header.
+  for top_name in (
+      # type_traits
+      'alignment_of',
+      'aligned_union',
+      ):
+    if Search(r'\bstd::%s\b' % top_name, line):
+      error(filename, linenum, 'build/c++11', 5,
+            ('std::%s is an unapproved C++11 class or function.  Send c-style '
+             'an example of where it would make your code more readable, and '
+             'they may let you use it.') % top_name)
+
+
+def FlagCxx14Features(filename, clean_lines, linenum, error):
+  """Flag those C++14 features that we restrict.
+
+  Args:
+    filename: The name of the current file.
+    clean_lines: A CleansedLines instance containing the file.
+    linenum: The number of the line to check.
+    error: The function to call with any errors found.
+  """
+  line = clean_lines.elided[linenum]
+
+  include = Match(r'\s*#\s*include\s+[<"]([^<"]+)[">]', line)
+
+  # Flag unapproved C++14 headers.
+  if include and include.group(1) in ('scoped_allocator', 'shared_mutex'):
+    error(filename, linenum, 'build/c++14', 5,
+          ('<%s> is an unapproved C++14 header.') % include.group(1))
+
+
 def ProcessFileData(filename, file_extension, lines, error,
                     extra_check_functions=[]):
   """Performs lint checks and reports any errors to the given error function.
@@ -4549,31 +5912,122 @@
 
   include_state = _IncludeState()
   function_state = _FunctionState()
-  nesting_state = _NestingState()
+  nesting_state = NestingState()
 
   ResetNolintSuppressions()
 
   CheckForCopyright(filename, lines, error)
-
-  if file_extension == 'h':
-    CheckForHeaderGuard(filename, lines, error)
-
+  ProcessGlobalSuppresions(lines)
   RemoveMultiLineComments(filename, lines, error)
   clean_lines = CleansedLines(lines)
+
+  if IsHeaderExtension(file_extension):
+    CheckForHeaderGuard(filename, clean_lines, error)
+
   for line in xrange(clean_lines.NumLines()):
     ProcessLine(filename, file_extension, clean_lines, line,
                 include_state, function_state, nesting_state, error,
                 extra_check_functions)
+    FlagCxx11Features(filename, clean_lines, line, error)
   nesting_state.CheckCompletedBlocks(filename, error)
 
   CheckForIncludeWhatYouUse(filename, clean_lines, include_state, error)
 
+  # Check that the .cc file has included its header if it exists.
+  if _IsSourceExtension(file_extension):
+    CheckHeaderFileIncluded(filename, include_state, error)
+
   # We check here rather than inside ProcessLine so that we see raw
   # lines rather than "cleaned" lines.
   CheckForBadCharacters(filename, lines, error)
 
   CheckForNewlineAtEOF(filename, lines, error)
 
+def ProcessConfigOverrides(filename):
+  """ Loads the configuration files and processes the config overrides.
+
+  Args:
+    filename: The name of the file being processed by the linter.
+
+  Returns:
+    False if the current |filename| should not be processed further.
+  """
+
+  abs_filename = os.path.abspath(filename)
+  cfg_filters = []
+  keep_looking = True
+  while keep_looking:
+    abs_path, base_name = os.path.split(abs_filename)
+    if not base_name:
+      break  # Reached the root directory.
+
+    cfg_file = os.path.join(abs_path, "CPPLINT.cfg")
+    abs_filename = abs_path
+    if not os.path.isfile(cfg_file):
+      continue
+
+    try:
+      with open(cfg_file) as file_handle:
+        for line in file_handle:
+          line, _, _ = line.partition('#')  # Remove comments.
+          if not line.strip():
+            continue
+
+          name, _, val = line.partition('=')
+          name = name.strip()
+          val = val.strip()
+          if name == 'set noparent':
+            keep_looking = False
+          elif name == 'filter':
+            cfg_filters.append(val)
+          elif name == 'exclude_files':
+            # When matching exclude_files pattern, use the base_name of
+            # the current file name or the directory name we are processing.
+            # For example, if we are checking for lint errors in /foo/bar/baz.cc
+            # and we found the .cfg file at /foo/CPPLINT.cfg, then the config
+            # file's "exclude_files" filter is meant to be checked against "bar"
+            # and not "baz" nor "bar/baz.cc".
+            if base_name:
+              pattern = re.compile(val)
+              if pattern.match(base_name):
+                if _cpplint_state.quiet:
+                  # Suppress "Ignoring file" warning when using --quiet.
+                  return False
+                sys.stderr.write('Ignoring "%s": file excluded by "%s". '
+                                 'File path component "%s" matches '
+                                 'pattern "%s"\n' %
+                                 (filename, cfg_file, base_name, val))
+                return False
+          elif name == 'linelength':
+            global _line_length
+            try:
+              _line_length = int(val)
+            except ValueError:
+              sys.stderr.write('Line length must be numeric.\n')
+          elif name == 'root':
+            global _root
+            # root directories are specified relative to CPPLINT.cfg dir.
+            _root = os.path.join(os.path.dirname(cfg_file), val)
+          elif name == 'headers':
+            ProcessHppHeadersOption(val)
+          else:
+            sys.stderr.write(
+                'Invalid configuration option (%s) in file %s\n' %
+                (name, cfg_file))
+
+    except IOError:
+      sys.stderr.write(
+          "Skipping config file '%s': Can't open for reading\n" % cfg_file)
+      keep_looking = False
+
+  # Apply all the accumulated filters in reverse order (top-level directory
+  # config options having the least priority).
+  for cfg_filter in reversed(cfg_filters):
+    _AddFilters(cfg_filter)
+
+  return True
+
+
 def ProcessFile(filename, vlevel, extra_check_functions=[]):
   """Does google-lint on a single file.
 
@@ -4589,7 +6043,15 @@
   """
 
   _SetVerboseLevel(vlevel)
+  _BackupFilters()
+  old_errors = _cpplint_state.error_count
 
+  if not ProcessConfigOverrides(filename):
+    _RestoreFilters()
+    return
+
+  lf_lines = []
+  crlf_lines = []
   try:
     # Support the UNIX convention of using "-" for stdin.  Note that
     # we are not opening the file with universal newline support
@@ -4597,10 +6059,7 @@
     # contain trailing '\r' characters if we are reading a file that
     # has CRLF endings.
     # If after the split a trailing '\r' is present, it is removed
-    # below. If it is not expected to be present (i.e. os.linesep !=
-    # '\r\n' as in Windows), a warning is issued below if this file
-    # is processed.
-
+    # below.
     if filename == '-':
       lines = codecs.StreamReaderWriter(sys.stdin,
                                         codecs.getreader('utf8'),
@@ -4609,16 +6068,19 @@
     else:
       lines = codecs.open(filename, 'r', 'utf8', 'replace').read().split('\n')
 
-    carriage_return_found = False
     # Remove trailing '\r'.
-    for linenum in range(len(lines)):
+    # The -1 accounts for the extra trailing blank line we get from split()
+    for linenum in range(len(lines) - 1):
       if lines[linenum].endswith('\r'):
         lines[linenum] = lines[linenum].rstrip('\r')
-        carriage_return_found = True
+        crlf_lines.append(linenum + 1)
+      else:
+        lf_lines.append(linenum + 1)
 
   except IOError:
     sys.stderr.write(
         "Skipping input '%s': Can't open for reading\n" % filename)
+    _RestoreFilters()
     return
 
   # Note, if no dot is found, this will give the entire filename as the ext.
@@ -4632,14 +6094,30 @@
   else:
     ProcessFileData(filename, file_extension, lines, Error,
                     extra_check_functions)
-    if carriage_return_found and os.linesep != '\r\n':
-      # Use 0 for linenum since outputting only one error for potentially
-      # several lines.
-      Error(filename, 0, 'whitespace/newline', 1,
-            'One or more unexpected \\r (^M) found;'
-            'better to use only a \\n')
 
-  sys.stderr.write('Done processing %s\n' % filename)
+    # If end-of-line sequences are a mix of LF and CR-LF, issue
+    # warnings on the lines with CR.
+    #
+    # Don't issue any warnings if all lines are uniformly LF or CR-LF,
+    # since critique can handle these just fine, and the style guide
+    # doesn't dictate a particular end of line sequence.
+    #
+    # We can't depend on os.linesep to determine what the desired
+    # end-of-line sequence should be, since that will return the
+    # server-side end-of-line sequence.
+    if lf_lines and crlf_lines:
+      # Warn on every line with CR.  An alternative approach might be to
+      # check whether the file is mostly CRLF or just LF, and warn on the
+      # minority; we bias toward LF here since most tools prefer LF.
+      for linenum in crlf_lines:
+        Error(filename, linenum, 'whitespace/newline', 1,
+              'Unexpected \\r (^M) found; better to use only \\n')
+
+  # Suppress printing anything if --quiet was passed unless the error
+  # count has increased after processing this file.
+  if not _cpplint_state.quiet or old_errors != _cpplint_state.error_count:
+    sys.stdout.write('Done processing %s\n' % filename)
+  _RestoreFilters()
 
 
 def PrintUsage(message):
@@ -4681,13 +6159,16 @@
                                                  'filter=',
                                                  'root=',
                                                  'linelength=',
-                                                 'extensions='])
+                                                 'extensions=',
+                                                 'headers=',
+                                                 'quiet'])
   except getopt.GetoptError:
     PrintUsage('Invalid arguments.')
 
   verbosity = _VerboseLevel()
   output_format = _OutputFormat()
   filters = ''
+  quiet = _Quiet()
   counting_style = ''
 
   for (opt, val) in opts:
@@ -4697,6 +6178,8 @@
       if val not in ('emacs', 'vs7', 'eclipse'):
         PrintUsage('The only allowed output formats are emacs, vs7 and eclipse.')
       output_format = val
+    elif opt == '--quiet':
+      quiet = True
     elif opt == '--verbose':
       verbosity = int(val)
     elif opt == '--filter':
@@ -4721,12 +6204,15 @@
       try:
           _valid_extensions = set(val.split(','))
       except ValueError:
-          PrintUsage('Extensions must be comma seperated list.')
+          PrintUsage('Extensions must be comma separated list.')
+    elif opt == '--headers':
+      ProcessHppHeadersOption(val)
 
   if not filenames:
     PrintUsage('No files were specified.')
 
   _SetOutputFormat(output_format)
+  _SetQuiet(quiet)
   _SetVerboseLevel(verbosity)
   _SetFilters(filters)
   _SetCountingStyle(counting_style)
@@ -4747,7 +6233,9 @@
   _cpplint_state.ResetErrorCounts()
   for filename in filenames:
     ProcessFile(filename, _cpplint_state.verbose_level)
-  _cpplint_state.PrintErrorCounts()
+  # If --quiet is passed, suppress printing error count unless there are errors.
+  if not _cpplint_state.quiet or _cpplint_state.error_count > 0:
+    _cpplint_state.PrintErrorCounts()
 
   sys.exit(_cpplint_state.error_count > 0)
 
diff --git a/libvpx/tools/diff.py b/libvpx/tools/diff.py
index a96c7db..860a6b0 100644
--- a/libvpx/tools/diff.py
+++ b/libvpx/tools/diff.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 ##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 ##
 ##  Use of this source code is governed by a BSD-style license
diff --git a/libvpx/tools/intersect-diffs.py b/libvpx/tools/intersect-diffs.py
index 4dbafa9..590e687 100755
--- a/libvpx/tools/intersect-diffs.py
+++ b/libvpx/tools/intersect-diffs.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 ##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 ##
 ##  Use of this source code is governed by a BSD-style license
@@ -69,7 +69,7 @@
                 break
 
     if out_hunks:
-        print FormatDiffHunks(out_hunks)
+        print(FormatDiffHunks(out_hunks))
         sys.exit(1)
 
 if __name__ == "__main__":
diff --git a/libvpx/tools/lint-hunks.py b/libvpx/tools/lint-hunks.py
index 30d3249..0a94afe 100755
--- a/libvpx/tools/lint-hunks.py
+++ b/libvpx/tools/lint-hunks.py
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python3
 ##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 ##
 ##  Use of this source code is governed by a BSD-style license
@@ -10,7 +10,7 @@
 """Performs style checking on each diff hunk."""
 import getopt
 import os
-import StringIO
+import io
 import subprocess
 import sys
 
@@ -63,21 +63,21 @@
     try:
         try:
             opts, args = getopt.getopt(argv[1:], SHORT_OPTIONS, LONG_OPTIONS)
-        except getopt.error, msg:
+        except getopt.error as msg:
             raise Usage(msg)
 
         # process options
         for o, _ in opts:
             if o in ("-h", "--help"):
-                print __doc__
+                print(__doc__)
                 sys.exit(0)
 
         if args and len(args) > 1:
-            print __doc__
+            print(__doc__)
             sys.exit(0)
 
         # Find the fully qualified path to the root of the tree
-        tl = Subprocess(TOPLEVEL_CMD, stdout=subprocess.PIPE)
+        tl = Subprocess(TOPLEVEL_CMD, stdout=subprocess.PIPE, text=True)
         tl = tl.communicate()[0].strip()
 
         # See if we're working on the index or not.
@@ -93,9 +93,9 @@
 
         # Get a list of all affected lines
         file_affected_line_map = {}
-        p = Subprocess(diff_cmd, stdout=subprocess.PIPE)
+        p = Subprocess(diff_cmd, stdout=subprocess.PIPE, text=True)
         stdout = p.communicate()[0]
-        for hunk in diff.ParseDiffHunks(StringIO.StringIO(stdout)):
+        for hunk in diff.ParseDiffHunks(io.StringIO(stdout)):
             filename = hunk.right.filename[2:]
             if filename not in file_affected_line_map:
                 file_affected_line_map[filename] = set()
@@ -103,7 +103,7 @@
 
         # Run each affected file through cpplint
         lint_failed = False
-        for filename, affected_lines in file_affected_line_map.iteritems():
+        for filename, affected_lines in file_affected_line_map.items():
             if filename.split(".")[-1] not in ("c", "h", "cc"):
                 continue
             if filename.startswith("third_party"):
@@ -112,14 +112,16 @@
             if args:
                 # File contents come from git
                 show_cmd = SHOW_CMD + [args[0] + ":" + filename]
-                show = Subprocess(show_cmd, stdout=subprocess.PIPE)
+                show = Subprocess(show_cmd, stdout=subprocess.PIPE, text=True)
                 lint = Subprocess(cpplint_cmd, expected_returncode=(0, 1),
-                                  stdin=show.stdout, stderr=subprocess.PIPE)
+                                  stdin=show.stdout, stderr=subprocess.PIPE,
+                                  text=True)
                 lint_out = lint.communicate()[1]
             else:
                 # File contents come from the working tree
                 lint = Subprocess(cpplint_cmd, expected_returncode=(0, 1),
-                                  stdin=subprocess.PIPE, stderr=subprocess.PIPE)
+                                  stdin=subprocess.PIPE, stderr=subprocess.PIPE,
+                                  text=True)
                 stdin = open(os.path.join(tl, filename)).read()
                 lint_out = lint.communicate(stdin)[1]
 
@@ -129,17 +131,17 @@
                     continue
                 warning_line_num = int(fields[1])
                 if warning_line_num in affected_lines:
-                    print "%s:%d:%s"%(filename, warning_line_num,
-                                      ":".join(fields[2:]))
+                    print("%s:%d:%s"%(filename, warning_line_num,
+                                      ":".join(fields[2:])))
                     lint_failed = True
 
         # Set exit code if any relevant lint errors seen
         if lint_failed:
             return 1
 
-    except Usage, err:
-        print >>sys.stderr, err
-        print >>sys.stderr, "for help use --help"
+    except Usage as err:
+        print(err, file=sys.stderr)
+        print("for help use --help", file=sys.stderr)
         return 2
 
 if __name__ == "__main__":
diff --git a/libvpx/tools/tiny_ssim.c b/libvpx/tools/tiny_ssim.c
index ff4634a..8fba814 100644
--- a/libvpx/tools/tiny_ssim.c
+++ b/libvpx/tools/tiny_ssim.c
@@ -425,20 +425,24 @@
       break;
     }
 #if CONFIG_VP9_HIGHBITDEPTH
-#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h)                            \
-  if (bit_depth < 9) {                                                         \
-    ssim = ssim2(buf0, buf1, w, w, w, h);                                      \
-    psnr = calc_plane_error(buf0, w, buf1, w, w, h);                           \
-  } else {                                                                     \
-    ssim = highbd_ssim2(CONVERT_TO_BYTEPTR(buf0), CONVERT_TO_BYTEPTR(buf1), w, \
-                        w, w, h, bit_depth);                                   \
-    psnr = calc_plane_error16(CAST_TO_SHORTPTR(buf0), w,                       \
-                              CAST_TO_SHORTPTR(buf1), w, w, h);                \
-  }
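+// The do { ... } while (0) wrapper turns each macro expansion into a single
+// statement, so the macros stay safe inside unbraced if/else bodies.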
+#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h)                           \
+  do {                                                                        \
+    if (bit_depth < 9) {                                                      \
+      ssim = ssim2(buf0, buf1, w, w, w, h);                                   \
+      psnr = calc_plane_error(buf0, w, buf1, w, w, h);                        \
+    } else {                                                                  \
+      ssim = highbd_ssim2(CONVERT_TO_BYTEPTR(buf0), CONVERT_TO_BYTEPTR(buf1), \
+                          w, w, w, h, bit_depth);                             \
+      psnr = calc_plane_error16(CAST_TO_SHORTPTR(buf0), w,                    \
+                                CAST_TO_SHORTPTR(buf1), w, w, h);             \
+    }                                                                         \
+  } while (0)
 #else
-#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h) \
-  ssim = ssim2(buf0, buf1, w, w, w, h);             \
-  psnr = calc_plane_error(buf0, w, buf1, w, w, h);
+#define psnr_and_ssim(ssim, psnr, buf0, buf1, w, h)  \
+  do {                                               \
+    ssim = ssim2(buf0, buf1, w, w, w, h);            \
+    psnr = calc_plane_error(buf0, w, buf1, w, w, h); \
+  } while (0)
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
     if (n_frames == allocated_frames) {
@@ -449,6 +453,10 @@
       psnry = realloc(psnry, allocated_frames * sizeof(*psnry));
       psnru = realloc(psnru, allocated_frames * sizeof(*psnru));
       psnrv = realloc(psnrv, allocated_frames * sizeof(*psnrv));
+      if (!(ssimy && ssimu && ssimv && psnry && psnru && psnrv)) {
+        fprintf(stderr, "Error allocating SSIM/PSNR data.\n");
+        exit(EXIT_FAILURE);
+      }
     }
     psnr_and_ssim(ssimy[n_frames], psnry[n_frames], y[0], y[1], w, h);
     psnr_and_ssim(ssimu[n_frames], psnru[n_frames], u[0], u[1], (w + 1) / 2,
diff --git a/libvpx/tools/wrap-commit-msg.py b/libvpx/tools/wrap-commit-msg.py
index d5b4b04..ba3fa58 100755
--- a/libvpx/tools/wrap-commit-msg.py
+++ b/libvpx/tools/wrap-commit-msg.py
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/env python3
 ##  Copyright (c) 2012 The WebM project authors. All Rights Reserved.
 ##
 ##  Use of this source code is governed by a BSD-style license
diff --git a/libvpx/tools_common.h b/libvpx/tools_common.h
index 4e8851f..b9cfb9c 100644
--- a/libvpx/tools_common.h
+++ b/libvpx/tools_common.h
@@ -116,12 +116,24 @@
 #define VPX_NO_RETURN
 #endif
 
+// Tells the compiler to perform `printf` format string checking if the
+// compiler supports it; see the 'format' attribute in
+// <https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html>.
+#define VPX_TOOLS_FORMAT_PRINTF(string_index, first_to_check)
+#if defined(__has_attribute)
+#if __has_attribute(format)
+#undef VPX_TOOLS_FORMAT_PRINTF
+#define VPX_TOOLS_FORMAT_PRINTF(string_index, first_to_check) \
+  __attribute__((__format__(__printf__, string_index, first_to_check)))
+#endif
+#endif
+
 /* Sets a stdio stream into binary mode */
 FILE *set_binary_mode(FILE *stream);
 
-VPX_NO_RETURN void die(const char *fmt, ...);
-VPX_NO_RETURN void fatal(const char *fmt, ...);
-void warn(const char *fmt, ...);
+VPX_NO_RETURN void die(const char *fmt, ...) VPX_TOOLS_FORMAT_PRINTF(1, 2);
+VPX_NO_RETURN void fatal(const char *fmt, ...) VPX_TOOLS_FORMAT_PRINTF(1, 2);
+void warn(const char *fmt, ...) VPX_TOOLS_FORMAT_PRINTF(1, 2);
 
 VPX_NO_RETURN void die_codec(vpx_codec_ctx_t *ctx, const char *s);
 
diff --git a/libvpx/vp8/common/blockd.h b/libvpx/vp8/common/blockd.h
index 02abe05..4054434 100644
--- a/libvpx/vp8/common/blockd.h
+++ b/libvpx/vp8/common/blockd.h
@@ -58,7 +58,7 @@
 extern const unsigned char vp8_block2left[25];
 extern const unsigned char vp8_block2above[25];
 
-#define VP8_COMBINEENTROPYCONTEXTS(Dest, A, B) Dest = (A) + (B);
+#define VP8_COMBINEENTROPYCONTEXTS(Dest, A, B) Dest = (A) + (B)
 
 typedef enum { KEY_FRAME = 0, INTER_FRAME = 1 } FRAME_TYPE;
 
diff --git a/libvpx/vp8/common/common.h b/libvpx/vp8/common/common.h
index 2c30e8d..562569f 100644
--- a/libvpx/vp8/common/common.h
+++ b/libvpx/vp8/common/common.h
@@ -24,22 +24,22 @@
 /* Only need this for fixed-size arrays, for structs just assign. */
 
 #define vp8_copy(Dest, Src)              \
-  {                                      \
+  do {                                   \
     assert(sizeof(Dest) == sizeof(Src)); \
     memcpy(Dest, Src, sizeof(Src));      \
-  }
+  } while (0)
 
 /* Use this for variably-sized arrays. */
 
 #define vp8_copy_array(Dest, Src, N)           \
-  {                                            \
+  do {                                         \
     assert(sizeof(*(Dest)) == sizeof(*(Src))); \
     memcpy(Dest, Src, (N) * sizeof(*(Src)));   \
-  }
+  } while (0)
 
-#define vp8_zero(Dest) memset(&(Dest), 0, sizeof(Dest));
+#define vp8_zero(Dest) memset(&(Dest), 0, sizeof(Dest))
 
-#define vp8_zero_array(Dest, N) memset(Dest, 0, (N) * sizeof(*(Dest)));
+#define vp8_zero_array(Dest, N) memset(Dest, 0, (N) * sizeof(*(Dest)))
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/vp8/common/generic/systemdependent.c b/libvpx/vp8/common/generic/systemdependent.c
index cd1b02c..71529bd 100644
--- a/libvpx/vp8/common/generic/systemdependent.c
+++ b/libvpx/vp8/common/generic/systemdependent.c
@@ -18,6 +18,8 @@
 #include "vpx_ports/ppc.h"
 #elif VPX_ARCH_MIPS
 #include "vpx_ports/mips.h"
+#elif VPX_ARCH_LOONGARCH
+#include "vpx_ports/loongarch.h"
 #endif
 #include "vp8/common/onyxc_int.h"
 #include "vp8/common/systemdependent.h"
@@ -100,6 +102,8 @@
   ctx->cpu_caps = ppc_simd_caps();
 #elif VPX_ARCH_MIPS
   ctx->cpu_caps = mips_cpu_caps();
+#elif VPX_ARCH_LOONGARCH
+  ctx->cpu_caps = loongarch_cpu_caps();
 #else
   // generic-gnu targets.
   ctx->cpu_caps = 0;
diff --git a/libvpx/vp8/common/loongarch/idct_lsx.c b/libvpx/vp8/common/loongarch/idct_lsx.c
new file mode 100644
index 0000000..eee871e
--- /dev/null
+++ b/libvpx/vp8/common/loongarch/idct_lsx.c
@@ -0,0 +1,322 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/blockd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static const int32_t cospi8sqrt2minus1 = 20091;
+static const int32_t sinpi8sqrt2 = 35468;
+
+#define TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, out0, out1, out2, out3)    \
+  do {                                                                    \
+    __m128i tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
+                                                                          \
+    DUP2_ARG2(__lsx_vilvl_h, in1, in0, in3, in2, tmp0_m, tmp1_m);         \
+    DUP2_ARG2(__lsx_vilvh_h, in1, in0, in3, in2, tmp2_m, tmp3_m);         \
+    DUP2_ARG2(__lsx_vilvl_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out0, out2); \
+    DUP2_ARG2(__lsx_vilvh_w, tmp1_m, tmp0_m, tmp3_m, tmp2_m, out1, out3); \
+  } while (0)
+
+#define TRANSPOSE_TWO_4x4_H(in0, in1, in2, in3, out0, out1, out2, out3) \
+  do {                                                                  \
+    __m128i s4_m, s5_m, s6_m, s7_m;                                     \
+                                                                        \
+    TRANSPOSE8X4_SH_SH(in0, in1, in2, in3, s4_m, s5_m, s6_m, s7_m);     \
+    DUP2_ARG2(__lsx_vilvl_d, s6_m, s4_m, s7_m, s5_m, out0, out2);       \
+    out1 = __lsx_vilvh_d(s6_m, s4_m);                                   \
+    out3 = __lsx_vilvh_d(s7_m, s5_m);                                   \
+  } while (0)
+
+#define EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in0, in1)   \
+  do {                                                        \
+    __m128i zero_m = __lsx_vldi(0);                           \
+    __m128i tmp1_m, tmp2_m;                                   \
+    __m128i sinpi8_sqrt2_m = __lsx_vreplgr2vr_w(sinpi8sqrt2); \
+                                                              \
+    tmp1_m = __lsx_vilvl_h(in0, zero_m);                      \
+    tmp2_m = __lsx_vilvh_h(in0, zero_m);                      \
+    tmp1_m = __lsx_vsrai_w(tmp1_m, 16);                       \
+    tmp2_m = __lsx_vsrai_w(tmp2_m, 16);                       \
+    tmp1_m = __lsx_vmul_w(tmp1_m, sinpi8_sqrt2_m);            \
+    tmp1_m = __lsx_vsrai_w(tmp1_m, 16);                       \
+    tmp2_m = __lsx_vmul_w(tmp2_m, sinpi8_sqrt2_m);            \
+    tmp2_m = __lsx_vsrai_w(tmp2_m, 16);                       \
+    in1 = __lsx_vpickev_h(tmp2_m, tmp1_m);                    \
+  } while (0)
+
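+/* One-dimensional VP8 inverse transform: the _H variant below works in
+   16-bit lanes for the row pass, while the _W variant works in 32-bit
+   lanes for the higher-precision column pass. */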
+#define VP8_IDCT_1D_H(in0, in1, in2, in3, out0, out1, out2, out3)      \
+  do {                                                                 \
+    __m128i a1_m, b1_m, c1_m, d1_m;                                    \
+    __m128i c_tmp1_m, c_tmp2_m;                                        \
+    __m128i d_tmp1_m, d_tmp2_m;                                        \
+    __m128i const_cospi8sqrt2minus1_m;                                 \
+                                                                       \
+    const_cospi8sqrt2minus1_m = __lsx_vreplgr2vr_h(cospi8sqrt2minus1); \
+    a1_m = __lsx_vadd_h(in0, in2);                                     \
+    b1_m = __lsx_vsub_h(in0, in2);                                     \
+    EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in1, c_tmp1_m);          \
+                                                                       \
+    c_tmp2_m = __lsx_vmuh_h(in3, const_cospi8sqrt2minus1_m);           \
+    c_tmp2_m = __lsx_vslli_h(c_tmp2_m, 1);                             \
+    c_tmp2_m = __lsx_vsrai_h(c_tmp2_m, 1);                             \
+    c_tmp2_m = __lsx_vadd_h(in3, c_tmp2_m);                            \
+    c1_m = __lsx_vsub_h(c_tmp1_m, c_tmp2_m);                           \
+                                                                       \
+    d_tmp1_m = __lsx_vmuh_h(in1, const_cospi8sqrt2minus1_m);           \
+    d_tmp1_m = __lsx_vslli_h(d_tmp1_m, 1);                             \
+    d_tmp1_m = __lsx_vsrai_h(d_tmp1_m, 1);                             \
+    d_tmp1_m = __lsx_vadd_h(in1, d_tmp1_m);                            \
+    EXPAND_TO_H_MULTIPLY_SINPI8SQRT2_PCK_TO_W(in3, d_tmp2_m);          \
+    d1_m = __lsx_vadd_h(d_tmp1_m, d_tmp2_m);                           \
+    LSX_BUTTERFLY_4_H(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
+  } while (0)
+
+#define VP8_IDCT_1D_W(in0, in1, in2, in3, out0, out1, out2, out3)      \
+  do {                                                                 \
+    __m128i a1_m, b1_m, c1_m, d1_m;                                    \
+    __m128i c_tmp1_m, c_tmp2_m, d_tmp1_m, d_tmp2_m;                    \
+    __m128i const_cospi8sqrt2minus1_m, sinpi8_sqrt2_m;                 \
+                                                                       \
+    const_cospi8sqrt2minus1_m = __lsx_vreplgr2vr_w(cospi8sqrt2minus1); \
+    sinpi8_sqrt2_m = __lsx_vreplgr2vr_w(sinpi8sqrt2);                  \
+    a1_m = __lsx_vadd_w(in0, in2);                                     \
+    b1_m = __lsx_vsub_w(in0, in2);                                     \
+    c_tmp1_m = __lsx_vmul_w(in1, sinpi8_sqrt2_m);                      \
+    c_tmp1_m = __lsx_vsrai_w(c_tmp1_m, 16);                            \
+    c_tmp2_m = __lsx_vmul_w(in3, const_cospi8sqrt2minus1_m);           \
+    c_tmp2_m = __lsx_vsrai_w(c_tmp2_m, 16);                            \
+    c_tmp2_m = __lsx_vadd_w(in3, c_tmp2_m);                            \
+    c1_m = __lsx_vsub_w(c_tmp1_m, c_tmp2_m);                           \
+    d_tmp1_m = __lsx_vmul_w(in1, const_cospi8sqrt2minus1_m);           \
+    d_tmp1_m = __lsx_vsrai_w(d_tmp1_m, 16);                            \
+    d_tmp1_m = __lsx_vadd_w(in1, d_tmp1_m);                            \
+    d_tmp2_m = __lsx_vmul_w(in3, sinpi8_sqrt2_m);                      \
+    d_tmp2_m = __lsx_vsrai_w(d_tmp2_m, 16);                            \
+    d1_m = __lsx_vadd_w(d_tmp1_m, d_tmp2_m);                           \
+    LSX_BUTTERFLY_4_W(a1_m, b1_m, c1_m, d1_m, out0, out1, out2, out3); \
+  } while (0)
+
+#define UNPCK_SH_SW(in, out0, out1)  \
+  do {                               \
+    out0 = __lsx_vsllwil_w_h(in, 0); \
+    out1 = __lsx_vexth_w_h(in);      \
+  } while (0)
+
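+/* Adds the rounded DC-only inverse transform value, (in_dc + 4) >> 3, to a
+   4x4 predictor block, clamps the result to [0, 255] and stores the four
+   4-byte rows to dest. */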
+static void idct4x4_addconst_lsx(int16_t in_dc, uint8_t *pred,
+                                 int32_t pred_stride, uint8_t *dest,
+                                 int32_t dest_stride) {
+  __m128i vec, res0, res1, res2, res3, dst0, dst1;
+  __m128i pred0, pred1, pred2, pred3;
+  __m128i zero = __lsx_vldi(0);
+
+  int32_t pred_stride2 = pred_stride << 1;
+  int32_t pred_stride3 = pred_stride2 + pred_stride;
+
+  vec = __lsx_vreplgr2vr_h(in_dc);
+  vec = __lsx_vsrari_h(vec, 3);
+  pred0 = __lsx_vld(pred, 0);
+  DUP2_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred1, pred2);
+  pred3 = __lsx_vldx(pred, pred_stride3);
+  DUP4_ARG2(__lsx_vilvl_b, zero, pred0, zero, pred1, zero, pred2, zero, pred3,
+            res0, res1, res2, res3);
+  DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0,
+            res1, res2, res3);
+  res0 = __lsx_vclip255_h(res0);
+  res1 = __lsx_vclip255_h(res1);
+  res2 = __lsx_vclip255_h(res2);
+  res3 = __lsx_vclip255_h(res3);
+
+  DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, dst0, dst1);
+  dst0 = __lsx_vpickev_w(dst1, dst0);
+  __lsx_vstelm_w(dst0, dest, 0, 0);
+  dest += dest_stride;
+  __lsx_vstelm_w(dst0, dest, 0, 1);
+  dest += dest_stride;
+  __lsx_vstelm_w(dst0, dest, 0, 2);
+  dest += dest_stride;
+  __lsx_vstelm_w(dst0, dest, 0, 3);
+}
+
+void vp8_dc_only_idct_add_lsx(int16_t input_dc, uint8_t *pred_ptr,
+                              int32_t pred_stride, uint8_t *dst_ptr,
+                              int32_t dst_stride) {
+  idct4x4_addconst_lsx(input_dc, pred_ptr, pred_stride, dst_ptr, dst_stride);
+}
+
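+/* Dequantizes two horizontally adjacent 4x4 coefficient blocks, applies the
+   row and column IDCT passes with a transpose in between, adds the clamped
+   result to an 8-pixel-wide destination and zeroes the coefficient buffer
+   for the next macroblock. */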
+static void dequant_idct4x4_addblk_2x_lsx(int16_t *input,
+                                          int16_t *dequant_input, uint8_t *dest,
+                                          int32_t dest_stride) {
+  __m128i dest0, dest1, dest2, dest3;
+  __m128i in0, in1, in2, in3, mul0, mul1, mul2, mul3, dequant_in0, dequant_in1;
+  __m128i hz0, hz1, hz2, hz3, vt0, vt1, vt2, vt3, res0, res1, res2, res3;
+  __m128i hz0l, hz1l, hz2l, hz3l, hz0r, hz1r, hz2r, hz3r;
+  __m128i vt0l, vt1l, vt2l, vt3l, vt0r, vt1r, vt2r, vt3r;
+  __m128i zero = __lsx_vldi(0);
+
+  int32_t dest_stride2 = dest_stride << 1;
+  int32_t dest_stride3 = dest_stride2 + dest_stride;
+
+  DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2,
+            in3);
+  DUP2_ARG2(__lsx_vld, dequant_input, 0, dequant_input, 16, dequant_in0,
+            dequant_in1);
+
+  DUP4_ARG2(__lsx_vmul_h, in0, dequant_in0, in1, dequant_in1, in2, dequant_in0,
+            in3, dequant_in1, mul0, mul1, mul2, mul3);
+  DUP2_ARG2(__lsx_vpickev_d, mul2, mul0, mul3, mul1, in0, in2);
+  DUP2_ARG2(__lsx_vpickod_d, mul2, mul0, mul3, mul1, in1, in3);
+
+  VP8_IDCT_1D_H(in0, in1, in2, in3, hz0, hz1, hz2, hz3);
+  TRANSPOSE_TWO_4x4_H(hz0, hz1, hz2, hz3, hz0, hz1, hz2, hz3);
+  UNPCK_SH_SW(hz0, hz0r, hz0l);
+  UNPCK_SH_SW(hz1, hz1r, hz1l);
+  UNPCK_SH_SW(hz2, hz2r, hz2l);
+  UNPCK_SH_SW(hz3, hz3r, hz3l);
+  VP8_IDCT_1D_W(hz0l, hz1l, hz2l, hz3l, vt0l, vt1l, vt2l, vt3l);
+  DUP4_ARG2(__lsx_vsrari_w, vt0l, 3, vt1l, 3, vt2l, 3, vt3l, 3, vt0l, vt1l,
+            vt2l, vt3l);
+  VP8_IDCT_1D_W(hz0r, hz1r, hz2r, hz3r, vt0r, vt1r, vt2r, vt3r);
+  DUP4_ARG2(__lsx_vsrari_w, vt0r, 3, vt1r, 3, vt2r, 3, vt3r, 3, vt0r, vt1r,
+            vt2r, vt3r);
+  DUP4_ARG2(__lsx_vpickev_h, vt0l, vt0r, vt1l, vt1r, vt2l, vt2r, vt3l, vt3r,
+            vt0, vt1, vt2, vt3);
+  TRANSPOSE_TWO_4x4_H(vt0, vt1, vt2, vt3, vt0, vt1, vt2, vt3);
+  dest0 = __lsx_vld(dest, 0);
+  DUP2_ARG2(__lsx_vldx, dest, dest_stride, dest, dest_stride2, dest1, dest2);
+  dest3 = __lsx_vldx(dest, dest_stride3);
+  DUP4_ARG2(__lsx_vilvl_b, zero, dest0, zero, dest1, zero, dest2, zero, dest3,
+            res0, res1, res2, res3);
+  DUP4_ARG2(__lsx_vadd_h, res0, vt0, res1, vt1, res2, vt2, res3, vt3, res0,
+            res1, res2, res3);
+
+  res0 = __lsx_vclip255_h(res0);
+  res1 = __lsx_vclip255_h(res1);
+  res2 = __lsx_vclip255_h(res2);
+  res3 = __lsx_vclip255_h(res3);
+  DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, vt0l, vt1l);
+
+  __lsx_vstelm_d(vt0l, dest, 0, 0);
+  __lsx_vstelm_d(vt0l, dest + dest_stride, 0, 1);
+  __lsx_vstelm_d(vt1l, dest + dest_stride2, 0, 0);
+  __lsx_vstelm_d(vt1l, dest + dest_stride3, 0, 1);
+
+  __lsx_vst(zero, input, 0);
+  __lsx_vst(zero, input, 16);
+  __lsx_vst(zero, input, 32);
+  __lsx_vst(zero, input, 48);
+}
+
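+/* DC-only fast path for two adjacent blocks: adds the per-block constant
+   (dc * dequant_input[0] + 4) >> 3 to the predictor and clears the two DC
+   coefficients. */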
+static void dequant_idct_addconst_2x_lsx(int16_t *input, int16_t *dequant_input,
+                                         uint8_t *dest, int32_t dest_stride) {
+  __m128i input_dc0, input_dc1, vec, res0, res1, res2, res3;
+  __m128i dest0, dest1, dest2, dest3;
+  __m128i zero = __lsx_vldi(0);
+  int32_t dest_stride2 = dest_stride << 1;
+  int32_t dest_stride3 = dest_stride2 + dest_stride;
+
+  input_dc0 = __lsx_vreplgr2vr_h(input[0] * dequant_input[0]);
+  input_dc1 = __lsx_vreplgr2vr_h(input[16] * dequant_input[0]);
+  DUP2_ARG2(__lsx_vsrari_h, input_dc0, 3, input_dc1, 3, input_dc0, input_dc1);
+  vec = __lsx_vpickev_d(input_dc1, input_dc0);
+  input[0] = 0;
+  input[16] = 0;
+  dest0 = __lsx_vld(dest, 0);
+  DUP2_ARG2(__lsx_vldx, dest, dest_stride, dest, dest_stride2, dest1, dest2);
+  dest3 = __lsx_vldx(dest, dest_stride3);
+  DUP4_ARG2(__lsx_vilvl_b, zero, dest0, zero, dest1, zero, dest2, zero, dest3,
+            res0, res1, res2, res3);
+  DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0,
+            res1, res2, res3);
+  res0 = __lsx_vclip255_h(res0);
+  res1 = __lsx_vclip255_h(res1);
+  res2 = __lsx_vclip255_h(res2);
+  res3 = __lsx_vclip255_h(res3);
+
+  DUP2_ARG2(__lsx_vpickev_b, res1, res0, res3, res2, res0, res1);
+  __lsx_vstelm_d(res0, dest, 0, 0);
+  __lsx_vstelm_d(res0, dest + dest_stride, 0, 1);
+  __lsx_vstelm_d(res1, dest + dest_stride2, 0, 0);
+  __lsx_vstelm_d(res1, dest + dest_stride3, 0, 1);
+}
+
+void vp8_dequant_idct_add_y_block_lsx(int16_t *q, int16_t *dq, uint8_t *dst,
+                                      int32_t stride, char *eobs) {
+  int16_t *eobs_h = (int16_t *)eobs;
+  uint8_t i;
+
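+  /* Each int16_t of eobs covers two adjacent 4x4 blocks. Masking with
+     0xfefe clears bit 0 of each byte, so the full IDCT runs only when at
+     least one of the two blocks has coefficients beyond DC; otherwise the
+     cheaper DC-only path is taken. */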
+  for (i = 4; i--;) {
+    if (eobs_h[0]) {
+      if (eobs_h[0] & 0xfefe) {
+        dequant_idct4x4_addblk_2x_lsx(q, dq, dst, stride);
+      } else {
+        dequant_idct_addconst_2x_lsx(q, dq, dst, stride);
+      }
+    }
+
+    q += 32;
+
+    if (eobs_h[1]) {
+      if (eobs_h[1] & 0xfefe) {
+        dequant_idct4x4_addblk_2x_lsx(q, dq, dst + 8, stride);
+      } else {
+        dequant_idct_addconst_2x_lsx(q, dq, dst + 8, stride);
+      }
+    }
+
+    q += 32;
+    dst += (4 * stride);
+    eobs_h += 2;
+  }
+}
+
+void vp8_dequant_idct_add_uv_block_lsx(int16_t *q, int16_t *dq, uint8_t *dst_u,
+                                       uint8_t *dst_v, int32_t stride,
+                                       char *eobs) {
+  int16_t *eobs_h = (int16_t *)eobs;
+  if (eobs_h[0]) {
+    if (eobs_h[0] & 0xfefe) {
+      dequant_idct4x4_addblk_2x_lsx(q, dq, dst_u, stride);
+    } else {
+      dequant_idct_addconst_2x_lsx(q, dq, dst_u, stride);
+    }
+  }
+
+  q += 32;
+  dst_u += (stride * 4);
+
+  if (eobs_h[1]) {
+    if (eobs_h[1] & 0xfefe) {
+      dequant_idct4x4_addblk_2x_lsx(q, dq, dst_u, stride);
+    } else {
+      dequant_idct_addconst_2x_lsx(q, dq, dst_u, stride);
+    }
+  }
+
+  q += 32;
+
+  if (eobs_h[2]) {
+    if (eobs_h[2] & 0xfefe) {
+      dequant_idct4x4_addblk_2x_lsx(q, dq, dst_v, stride);
+    } else {
+      dequant_idct_addconst_2x_lsx(q, dq, dst_v, stride);
+    }
+  }
+  q += 32;
+  dst_v += (stride * 4);
+
+  if (eobs_h[3]) {
+    if (eobs_h[3] & 0xfefe) {
+      dequant_idct4x4_addblk_2x_lsx(q, dq, dst_v, stride);
+    } else {
+      dequant_idct_addconst_2x_lsx(q, dq, dst_v, stride);
+    }
+  }
+}
diff --git a/libvpx/vp8/common/loongarch/loopfilter_filters_lsx.c b/libvpx/vp8/common/loongarch/loopfilter_filters_lsx.c
new file mode 100644
index 0000000..79c3ea6
--- /dev/null
+++ b/libvpx/vp8/common/loongarch/loopfilter_filters_lsx.c
@@ -0,0 +1,743 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS. All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/loopfilter.h"
+#include "vpx_util/loongson_intrinsics.h"
+
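+/* VP8 inner-edge filter applied to 16 pixels at once. Pixels are biased by
+   0x80 so that saturating signed byte arithmetic reproduces the scalar
+   clamping; p1/p0/q0/q1 are adjusted only where mask (and, for the outer
+   taps, hev) allows. */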
+#define VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev)        \
+  do {                                                       \
+    __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2; \
+    const __m128i cnst4b = __lsx_vldi(4);                    \
+    const __m128i cnst3b = __lsx_vldi(3);                    \
+                                                             \
+    p1_m = __lsx_vxori_b(p1, 0x80);                          \
+    p0_m = __lsx_vxori_b(p0, 0x80);                          \
+    q0_m = __lsx_vxori_b(q0, 0x80);                          \
+    q1_m = __lsx_vxori_b(q1, 0x80);                          \
+                                                             \
+    filt = __lsx_vssub_b(p1_m, q1_m);                        \
+    filt = __lsx_vand_v(filt, hev);                          \
+    q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m);                   \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);                   \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);                   \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);                   \
+    filt = __lsx_vand_v(filt, mask);                         \
+    t1 = __lsx_vsadd_b(filt, cnst4b);                        \
+    t1 = __lsx_vsra_b(t1, cnst3b);                           \
+    t2 = __lsx_vsadd_b(filt, cnst3b);                        \
+    t2 = __lsx_vsra_b(t2, cnst3b);                           \
+    q0_m = __lsx_vssub_b(q0_m, t1);                          \
+    q0 = __lsx_vxori_b(q0_m, 0x80);                          \
+    p0_m = __lsx_vsadd_b(p0_m, t2);                          \
+    p0 = __lsx_vxori_b(p0_m, 0x80);                          \
+    filt = __lsx_vsrari_b(t1, 1);                            \
+    hev = __lsx_vxori_b(hev, 0xff);                          \
+    filt = __lsx_vand_v(filt, hev);                          \
+    q1_m = __lsx_vssub_b(q1_m, filt);                        \
+    q1 = __lsx_vxori_b(q1_m, 0x80);                          \
+    p1_m = __lsx_vsadd_b(p1_m, filt);                        \
+    p1 = __lsx_vxori_b(p1_m, 0x80);                          \
+  } while (0)
+
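+/* VP8 macroblock-edge filter. Where hev is not set it applies the wider
+   correction, adjusting the inner, middle and outer pixel pairs by
+   (27 * f + 63) >> 7, (18 * f + 63) >> 7 and (9 * f + 63) >> 7
+   respectively. */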
+#define VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev) \
+  do {                                                  \
+    __m128i p2_m, p1_m, p0_m, q2_m, q1_m, q0_m;         \
+    __m128i u, filt, t1, t2, filt_sign, q0_sub_p0;      \
+    __m128i filt_r, filt_l;                             \
+    __m128i temp0, temp1, temp2, temp3;                 \
+    const __m128i cnst4b = __lsx_vldi(4);               \
+    const __m128i cnst3b = __lsx_vldi(3);               \
+    const __m128i cnst9h = __lsx_vldi(1033);            \
+    const __m128i cnst63h = __lsx_vldi(1087);           \
+                                                        \
+    p2_m = __lsx_vxori_b(p2, 0x80);                     \
+    p1_m = __lsx_vxori_b(p1, 0x80);                     \
+    p0_m = __lsx_vxori_b(p0, 0x80);                     \
+    q0_m = __lsx_vxori_b(q0, 0x80);                     \
+    q1_m = __lsx_vxori_b(q1, 0x80);                     \
+    q2_m = __lsx_vxori_b(q2, 0x80);                     \
+                                                        \
+    filt = __lsx_vssub_b(p1_m, q1_m);                   \
+    q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m);              \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);              \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);              \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);              \
+    filt = __lsx_vand_v(filt, mask);                    \
+                                                        \
+    t2 = __lsx_vand_v(filt, hev);                       \
+    hev = __lsx_vxori_b(hev, 0xff);                     \
+    filt = __lsx_vand_v(hev, filt);                     \
+    t1 = __lsx_vsadd_b(t2, cnst4b);                     \
+    t1 = __lsx_vsra_b(t1, cnst3b);                      \
+    t2 = __lsx_vsadd_b(t2, cnst3b);                     \
+    t2 = __lsx_vsra_b(t2, cnst3b);                      \
+    q0_m = __lsx_vssub_b(q0_m, t1);                     \
+    p0_m = __lsx_vsadd_b(p0_m, t2);                     \
+    filt_sign = __lsx_vslti_b(filt, 0);                 \
+    filt_r = __lsx_vilvl_b(filt_sign, filt);            \
+    filt_l = __lsx_vilvh_b(filt_sign, filt);            \
+    temp0 = __lsx_vmul_h(filt_r, cnst9h);               \
+    temp1 = __lsx_vadd_h(temp0, cnst63h);               \
+    temp2 = __lsx_vmul_h(filt_l, cnst9h);               \
+    temp3 = __lsx_vadd_h(temp2, cnst63h);               \
+                                                        \
+    u = __lsx_vssrani_b_h(temp3, temp1, 7);             \
+    q2_m = __lsx_vssub_b(q2_m, u);                      \
+    p2_m = __lsx_vsadd_b(p2_m, u);                      \
+    q2 = __lsx_vxori_b(q2_m, 0x80);                     \
+    p2 = __lsx_vxori_b(p2_m, 0x80);                     \
+                                                        \
+    temp1 = __lsx_vadd_h(temp1, temp0);                 \
+    temp3 = __lsx_vadd_h(temp3, temp2);                 \
+                                                        \
+    u = __lsx_vssrani_b_h(temp3, temp1, 7);             \
+    q1_m = __lsx_vssub_b(q1_m, u);                      \
+    p1_m = __lsx_vsadd_b(p1_m, u);                      \
+    q1 = __lsx_vxori_b(q1_m, 0x80);                     \
+    p1 = __lsx_vxori_b(p1_m, 0x80);                     \
+                                                        \
+    temp1 = __lsx_vadd_h(temp1, temp0);                 \
+    temp3 = __lsx_vadd_h(temp3, temp2);                 \
+                                                        \
+    u = __lsx_vssrani_b_h(temp3, temp1, 7);             \
+    q0_m = __lsx_vssub_b(q0_m, u);                      \
+    p0_m = __lsx_vsadd_b(p0_m, u);                      \
+    q0 = __lsx_vxori_b(q0_m, 0x80);                     \
+    p0 = __lsx_vxori_b(p0_m, 0x80);                     \
+  } while (0)
+
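+/* Builds the filter decision masks: mask_out selects edges whose
+   neighboring-pixel differences are within limit/b_limit, and hev_out flags
+   high edge variance (|p1 - p0| or |q1 - q0| above thresh). */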
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+                     limit_in, b_limit_in, thresh_in, hev_out, mask_out,     \
+                     flat_out)                                               \
+  do {                                                                       \
+    __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;          \
+    __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;          \
+                                                                             \
+    p3_asub_p2_m = __lsx_vabsd_bu(p3_in, p2_in);                             \
+    p2_asub_p1_m = __lsx_vabsd_bu(p2_in, p1_in);                             \
+    p1_asub_p0_m = __lsx_vabsd_bu(p1_in, p0_in);                             \
+    q1_asub_q0_m = __lsx_vabsd_bu(q1_in, q0_in);                             \
+    q2_asub_q1_m = __lsx_vabsd_bu(q2_in, q1_in);                             \
+    q3_asub_q2_m = __lsx_vabsd_bu(q3_in, q2_in);                             \
+    p0_asub_q0_m = __lsx_vabsd_bu(p0_in, q0_in);                             \
+    p1_asub_q1_m = __lsx_vabsd_bu(p1_in, q1_in);                             \
+    flat_out = __lsx_vmax_bu(p1_asub_p0_m, q1_asub_q0_m);                    \
+    hev_out = __lsx_vslt_bu(thresh_in, flat_out);                            \
+    p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p0_asub_q0_m);               \
+    p1_asub_q1_m = __lsx_vsrli_b(p1_asub_q1_m, 1);                           \
+    p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p1_asub_q1_m);               \
+    mask_out = __lsx_vslt_bu(b_limit_in, p0_asub_q0_m);                      \
+    mask_out = __lsx_vmax_bu(flat_out, mask_out);                            \
+    p3_asub_p2_m = __lsx_vmax_bu(p3_asub_p2_m, p2_asub_p1_m);                \
+    mask_out = __lsx_vmax_bu(p3_asub_p2_m, mask_out);                        \
+    q2_asub_q1_m = __lsx_vmax_bu(q2_asub_q1_m, q3_asub_q2_m);                \
+    mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out);                        \
+    mask_out = __lsx_vslt_bu(limit_in, mask_out);                            \
+    mask_out = __lsx_vxori_b(mask_out, 0xff);                                \
+  } while (0)
+
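+/* Stores one 6-pixel row: a 4-byte word element from in0 followed by a
+   2-byte halfword element from in1 at byte offset "stride" (4 at every
+   call site below). */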
+#define VP8_ST6x1_B(in0, in0_idx, in1, in1_idx, pdst, stride) \
+  do {                                                        \
+    __lsx_vstelm_w(in0, pdst, 0, in0_idx);                    \
+    __lsx_vstelm_h(in1, pdst + stride, 0, in1_idx);           \
+  } while (0)
+
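+/* The "dual" variants filter a 16-pixel edge per call, packing two sets of
+   b_limit/limit/thresh values (one per 8-pixel half) into a single vector;
+   the block-edge callers in this file pass the same set twice. */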
+static void loop_filter_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch,
+                                              const uint8_t *b_limit0_ptr,
+                                              const uint8_t *limit0_ptr,
+                                              const uint8_t *thresh0_ptr,
+                                              const uint8_t *b_limit1_ptr,
+                                              const uint8_t *limit1_ptr,
+                                              const uint8_t *thresh1_ptr) {
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+
+  __m128i mask, hev, flat;
+  __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+
+  DUP4_ARG2(__lsx_vldx, src, -pitch_x4, src, -pitch_x3, src, -pitch_x2, src,
+            -pitch, p3, p2, p1, p0);
+  q0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch_x2, q1, q2);
+  q3 = __lsx_vldx(src, pitch_x3);
+
+  thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0);
+  thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0);
+  thresh0 = __lsx_vilvl_d(thresh1, thresh0);
+
+  b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0);
+  b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0);
+  b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
+
+  limit0 = __lsx_vldrepl_b(limit0_ptr, 0);
+  limit1 = __lsx_vldrepl_b(limit1_ptr, 0);
+  limit0 = __lsx_vilvl_d(limit1, limit0);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+               mask, flat);
+  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+  __lsx_vstx(p1, src, -pitch_x2);
+  __lsx_vstx(p0, src, -pitch);
+  __lsx_vst(q0, src, 0);
+  __lsx_vstx(q1, src, pitch);
+}
+
+static void loop_filter_vertical_4_dual_lsx(uint8_t *src, int32_t pitch,
+                                            const uint8_t *b_limit0_ptr,
+                                            const uint8_t *limit0_ptr,
+                                            const uint8_t *thresh0_ptr,
+                                            const uint8_t *b_limit1_ptr,
+                                            const uint8_t *limit1_ptr,
+                                            const uint8_t *thresh1_ptr) {
+  uint8_t *src_tmp0 = src - 4;
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+  __m128i mask, hev, flat;
+  __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+  __m128i row8, row9, row10, row11, row12, row13, row14, row15;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+  row0 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row1, row2);
+  row3 = __lsx_vldx(src_tmp0, pitch_x3);
+  src_tmp0 += pitch_x4;
+  row4 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row5, row6);
+  row7 = __lsx_vldx(src_tmp0, pitch_x3);
+  src_tmp0 += pitch_x4;
+
+  row8 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row9, row10);
+  row11 = __lsx_vldx(src_tmp0, pitch_x3);
+  src_tmp0 += pitch_x4;
+  row12 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, pitch, src_tmp0, pitch_x2, row13, row14);
+  row15 = __lsx_vldx(src_tmp0, pitch_x3);
+
+  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
+                      p1, p0, q0, q1, q2, q3);
+
+  thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0);
+  thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0);
+  thresh0 = __lsx_vilvl_d(thresh1, thresh0);
+
+  b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0);
+  b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0);
+  b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
+
+  limit0 = __lsx_vldrepl_b(limit0_ptr, 0);
+  limit1 = __lsx_vldrepl_b(limit1_ptr, 0);
+  limit0 = __lsx_vilvl_d(limit1, limit0);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+               mask, flat);
+  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+  DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1);
+  tmp2 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp3 = __lsx_vilvh_h(tmp1, tmp0);
+  DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1);
+  tmp4 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp5 = __lsx_vilvh_h(tmp1, tmp0);
+
+  src -= 2;
+  __lsx_vstelm_w(tmp2, src, 0, 0);
+  src += pitch;
+  __lsx_vstelm_w(tmp2, src, 0, 1);
+  src += pitch;
+  __lsx_vstelm_w(tmp2, src, 0, 2);
+  src += pitch;
+  __lsx_vstelm_w(tmp2, src, 0, 3);
+  src += pitch;
+
+  __lsx_vstelm_w(tmp3, src, 0, 0);
+  src += pitch;
+  __lsx_vstelm_w(tmp3, src, 0, 1);
+  src += pitch;
+  __lsx_vstelm_w(tmp3, src, 0, 2);
+  src += pitch;
+  __lsx_vstelm_w(tmp3, src, 0, 3);
+  src += pitch;
+
+  __lsx_vstelm_w(tmp4, src, 0, 0);
+  src += pitch;
+  __lsx_vstelm_w(tmp4, src, 0, 1);
+  src += pitch;
+  __lsx_vstelm_w(tmp4, src, 0, 2);
+  src += pitch;
+  __lsx_vstelm_w(tmp4, src, 0, 3);
+  src += pitch;
+
+  __lsx_vstelm_w(tmp5, src, 0, 0);
+  src += pitch;
+  __lsx_vstelm_w(tmp5, src, 0, 1);
+  src += pitch;
+  __lsx_vstelm_w(tmp5, src, 0, 2);
+  src += pitch;
+  __lsx_vstelm_w(tmp5, src, 0, 3);
+}
+
+static void loop_filter_horizontal_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v,
+                                               int32_t pitch,
+                                               const uint8_t b_limit_in,
+                                               const uint8_t limit_in,
+                                               const uint8_t thresh_in) {
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i mask, hev, flat, thresh, limit, b_limit;
+  __m128i p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+  __m128i p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+  thresh = __lsx_vreplgr2vr_b(thresh_in);
+  limit = __lsx_vreplgr2vr_b(limit_in);
+  b_limit = __lsx_vreplgr2vr_b(b_limit_in);
+
+  DUP4_ARG2(__lsx_vldx, src_u, -pitch_x4, src_u, -pitch_x3, src_u, -pitch_x2,
+            src_u, -pitch, p3_u, p2_u, p1_u, p0_u);
+  q0_u = __lsx_vld(src_u, 0);
+  DUP2_ARG2(__lsx_vldx, src_u, pitch, src_u, pitch_x2, q1_u, q2_u);
+  q3_u = __lsx_vldx(src_u, pitch_x3);
+
+  DUP4_ARG2(__lsx_vldx, src_v, -pitch_x4, src_v, -pitch_x3, src_v, -pitch_x2,
+            src_v, -pitch, p3_v, p2_v, p1_v, p0_v);
+  q0_v = __lsx_vld(src_v, 0);
+  DUP2_ARG2(__lsx_vldx, src_v, pitch, src_v, pitch_x2, q1_v, q2_v);
+  q3_v = __lsx_vldx(src_v, pitch_x3);
+
+  /* The right (low) 8 elements of p3 are u pixels and the
+     left (high) 8 elements are v pixels. */
+  DUP4_ARG2(__lsx_vilvl_d, p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3,
+            p2, p1, p0);
+  DUP4_ARG2(__lsx_vilvl_d, q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0,
+            q1, q2, q3);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+  __lsx_vstelm_d(q1, src_u + pitch, 0, 0);
+  __lsx_vstelm_d(q0, src_u, 0, 0);
+  __lsx_vstelm_d(p0, src_u - pitch, 0, 0);
+  __lsx_vstelm_d(p1, src_u - pitch_x2, 0, 0);
+
+  __lsx_vstelm_d(q1, src_v + pitch, 0, 1);
+  __lsx_vstelm_d(q0, src_v, 0, 1);
+  __lsx_vstelm_d(p0, src_v - pitch, 0, 1);
+  __lsx_vstelm_d(p1, src_v - pitch_x2, 0, 1);
+}
+
+static void loop_filter_vertical_edge_uv_lsx(uint8_t *src_u, uint8_t *src_v,
+                                             int32_t pitch,
+                                             const uint8_t b_limit_in,
+                                             const uint8_t limit_in,
+                                             const uint8_t thresh_in) {
+  uint8_t *src_u_tmp, *src_v_tmp;
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i mask, hev, flat, thresh, limit, b_limit;
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8;
+  __m128i row9, row10, row11, row12, row13, row14, row15;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+
+  thresh = __lsx_vreplgr2vr_b(thresh_in);
+  limit = __lsx_vreplgr2vr_b(limit_in);
+  b_limit = __lsx_vreplgr2vr_b(b_limit_in);
+
+  src_u_tmp = src_u - 4;
+  row0 = __lsx_vld(src_u_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_u_tmp, pitch, src_u_tmp, pitch_x2, row1, row2);
+  row3 = __lsx_vldx(src_u_tmp, pitch_x3);
+  src_u_tmp += pitch_x4;
+  row4 = __lsx_vld(src_u_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_u_tmp, pitch, src_u_tmp, pitch_x2, row5, row6);
+  row7 = __lsx_vldx(src_u_tmp, pitch_x3);
+
+  src_v_tmp = src_v - 4;
+  row8 = __lsx_vld(src_v_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_v_tmp, pitch, src_v_tmp, pitch_x2, row9, row10);
+  row11 = __lsx_vldx(src_v_tmp, pitch_x3);
+  src_v_tmp += pitch_x4;
+  row12 = __lsx_vld(src_v_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_v_tmp, pitch, src_v_tmp, pitch_x2, row13, row14);
+  row15 = __lsx_vldx(src_v_tmp, pitch_x3);
+
+  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
+                      p1, p0, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP8_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev);
+
+  DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1);
+  tmp2 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp3 = __lsx_vilvh_h(tmp1, tmp0);
+
+  tmp0 = __lsx_vilvh_b(p0, p1);
+  tmp1 = __lsx_vilvh_b(q1, q0);
+  tmp4 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp5 = __lsx_vilvh_h(tmp1, tmp0);
+
+  src_u_tmp += 2;
+  __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x4, 0, 0);
+  __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x3, 0, 1);
+  __lsx_vstelm_w(tmp2, src_u_tmp - pitch_x2, 0, 2);
+  __lsx_vstelm_w(tmp2, src_u_tmp - pitch, 0, 3);
+
+  __lsx_vstelm_w(tmp3, src_u_tmp, 0, 0);
+  __lsx_vstelm_w(tmp3, src_u_tmp + pitch, 0, 1);
+  __lsx_vstelm_w(tmp3, src_u_tmp + pitch_x2, 0, 2);
+  __lsx_vstelm_w(tmp3, src_u_tmp + pitch_x3, 0, 3);
+
+  src_v_tmp += 2;
+  __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x4, 0, 0);
+  __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x3, 0, 1);
+  __lsx_vstelm_w(tmp4, src_v_tmp - pitch_x2, 0, 2);
+  __lsx_vstelm_w(tmp4, src_v_tmp - pitch, 0, 3);
+
+  __lsx_vstelm_w(tmp5, src_v_tmp, 0, 0);
+  __lsx_vstelm_w(tmp5, src_v_tmp + pitch, 0, 1);
+  __lsx_vstelm_w(tmp5, src_v_tmp + pitch_x2, 0, 2);
+  __lsx_vstelm_w(tmp5, src_v_tmp + pitch_x3, 0, 3);
+}
+
+static inline void mbloop_filter_horizontal_edge_y_lsx(
+    uint8_t *src, int32_t pitch, const uint8_t b_limit_in,
+    const uint8_t limit_in, const uint8_t thresh_in) {
+  uint8_t *temp_src;
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i mask, hev, flat, thresh, limit, b_limit;
+
+  DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit);
+  thresh = __lsx_vldrepl_b(&thresh_in, 0);
+
+  temp_src = src - pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, p3, p2, p1, p0);
+  temp_src += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, q0, q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+  temp_src = src - pitch_x3;
+  __lsx_vstx(p2, temp_src, 0);
+  __lsx_vstx(p1, temp_src, pitch);
+  __lsx_vstx(p0, temp_src, pitch_x2);
+  __lsx_vstx(q0, temp_src, pitch_x3);
+  temp_src += pitch_x4;
+  __lsx_vstx(q1, temp_src, 0);
+  __lsx_vstx(q2, temp_src, pitch);
+}
+
+static inline void mbloop_filter_horizontal_edge_uv_lsx(
+    uint8_t *src_u, uint8_t *src_v, int32_t pitch, const uint8_t b_limit_in,
+    const uint8_t limit_in, const uint8_t thresh_in) {
+  uint8_t *temp_src;
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i mask, hev, flat, thresh, limit, b_limit;
+  __m128i p3_u, p2_u, p1_u, p0_u, q3_u, q2_u, q1_u, q0_u;
+  __m128i p3_v, p2_v, p1_v, p0_v, q3_v, q2_v, q1_v, q0_v;
+
+  DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit);
+  thresh = __lsx_vldrepl_b(&thresh_in, 0);
+
+  temp_src = src_u - pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, p3_u, p2_u, p1_u, p0_u);
+  temp_src += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, q0_u, q1_u, q2_u, q3_u);
+  temp_src = src_v - pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, p3_v, p2_v, p1_v, p0_v);
+  temp_src += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, q0_v, q1_v, q2_v, q3_v);
+
+  DUP4_ARG2(__lsx_vilvl_d, p3_v, p3_u, p2_v, p2_u, p1_v, p1_u, p0_v, p0_u, p3,
+            p2, p1, p0);
+  DUP4_ARG2(__lsx_vilvl_d, q0_v, q0_u, q1_v, q1_u, q2_v, q2_u, q3_v, q3_u, q0,
+            q1, q2, q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+  src_u -= pitch_x3;
+  __lsx_vstelm_d(p2, src_u, 0, 0);
+  __lsx_vstelm_d(p1, src_u + pitch, 0, 0);
+  __lsx_vstelm_d(p0, src_u + pitch_x2, 0, 0);
+  __lsx_vstelm_d(q0, src_u + pitch_x3, 0, 0);
+  src_u += pitch_x4;
+  __lsx_vstelm_d(q1, src_u, 0, 0);
+  src_u += pitch;
+  __lsx_vstelm_d(q2, src_u, 0, 0);
+
+  src_v -= pitch_x3;
+  __lsx_vstelm_d(p2, src_v, 0, 1);
+  __lsx_vstelm_d(p1, src_v + pitch, 0, 1);
+  __lsx_vstelm_d(p0, src_v + pitch_x2, 0, 1);
+  __lsx_vstelm_d(q0, src_v + pitch_x3, 0, 1);
+  src_v += pitch_x4;
+  __lsx_vstelm_d(q1, src_v, 0, 1);
+  src_v += pitch;
+  __lsx_vstelm_d(q2, src_v, 0, 1);
+}
+
+static inline void mbloop_filter_vertical_edge_y_lsx(uint8_t *src,
+                                                     int32_t pitch,
+                                                     const uint8_t b_limit_in,
+                                                     const uint8_t limit_in,
+                                                     const uint8_t thresh_in) {
+  uint8_t *temp_src;
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i mask, hev, flat, thresh, limit, b_limit;
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8;
+  __m128i row9, row10, row11, row12, row13, row14, row15;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+  DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit);
+  thresh = __lsx_vldrepl_b(&thresh_in, 0);
+  temp_src = src - 4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, row0, row1, row2, row3);
+  temp_src += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, row4, row5, row6, row7);
+  temp_src += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, row8, row9, row10, row11);
+  temp_src += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, temp_src, 0, temp_src, pitch, temp_src, pitch_x2,
+            temp_src, pitch_x3, row12, row13, row14, row15);
+  temp_src -= pitch_x4;
+  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
+                      p1, p0, q0, q1, q2, q3);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+  DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, tmp0, tmp1);
+  tmp3 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp4 = __lsx_vilvh_h(tmp1, tmp0);
+  DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, tmp0, tmp1);
+  tmp6 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp7 = __lsx_vilvh_h(tmp1, tmp0);
+  tmp2 = __lsx_vilvl_b(q2, q1);
+  tmp5 = __lsx_vilvh_b(q2, q1);
+
+  temp_src = src - 3;
+  VP8_ST6x1_B(tmp3, 0, tmp2, 0, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp3, 1, tmp2, 1, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp3, 2, tmp2, 2, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp3, 3, tmp2, 3, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp4, 0, tmp2, 4, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp4, 1, tmp2, 5, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp4, 2, tmp2, 6, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp4, 3, tmp2, 7, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp6, 0, tmp5, 0, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp6, 1, tmp5, 1, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp6, 2, tmp5, 2, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp6, 3, tmp5, 3, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp7, 0, tmp5, 4, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp7, 1, tmp5, 5, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp7, 2, tmp5, 6, temp_src, 4);
+  temp_src += pitch;
+  VP8_ST6x1_B(tmp7, 3, tmp5, 7, temp_src, 4);
+}
+
+static inline void mbloop_filter_vertical_edge_uv_lsx(
+    uint8_t *src_u, uint8_t *src_v, int32_t pitch, const uint8_t b_limit_in,
+    const uint8_t limit_in, const uint8_t thresh_in) {
+  int32_t pitch_x2 = pitch << 1;
+  int32_t pitch_x3 = pitch_x2 + pitch;
+  int32_t pitch_x4 = pitch << 2;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i mask, hev, flat, thresh, limit, b_limit;
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7, row8;
+  __m128i row9, row10, row11, row12, row13, row14, row15;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+
+  DUP2_ARG2(__lsx_vldrepl_b, &b_limit_in, 0, &limit_in, 0, b_limit, limit);
+  thresh = __lsx_vldrepl_b(&thresh_in, 0);
+
+  src_u -= 4;
+  DUP4_ARG2(__lsx_vldx, src_u, 0, src_u, pitch, src_u, pitch_x2, src_u,
+            pitch_x3, row0, row1, row2, row3);
+  src_u += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, src_u, 0, src_u, pitch, src_u, pitch_x2, src_u,
+            pitch_x3, row4, row5, row6, row7);
+  src_v -= 4;
+  DUP4_ARG2(__lsx_vldx, src_v, 0, src_v, pitch, src_v, pitch_x2, src_v,
+            pitch_x3, row8, row9, row10, row11);
+  src_v += pitch_x4;
+  DUP4_ARG2(__lsx_vldx, src_v, 0, src_v, pitch, src_v, pitch_x2, src_v,
+            pitch_x3, row12, row13, row14, row15);
+  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
+                      p1, p0, q0, q1, q2, q3);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP8_MBFILTER(p2, p1, p0, q0, q1, q2, mask, hev);
+
+  DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, tmp0, tmp1);
+  tmp3 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp4 = __lsx_vilvh_h(tmp1, tmp0);
+  DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, tmp0, tmp1);
+  tmp6 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp7 = __lsx_vilvh_h(tmp1, tmp0);
+  tmp2 = __lsx_vilvl_b(q2, q1);
+  tmp5 = __lsx_vilvh_b(q2, q1);
+
+  src_u += 1 - pitch_x4;
+  VP8_ST6x1_B(tmp3, 0, tmp2, 0, src_u, 4);
+  src_u += pitch;
+  VP8_ST6x1_B(tmp3, 1, tmp2, 1, src_u, 4);
+  src_u += pitch;
+  VP8_ST6x1_B(tmp3, 2, tmp2, 2, src_u, 4);
+  src_u += pitch;
+  VP8_ST6x1_B(tmp3, 3, tmp2, 3, src_u, 4);
+  src_u += pitch;
+  VP8_ST6x1_B(tmp4, 0, tmp2, 4, src_u, 4);
+  src_u += pitch;
+  VP8_ST6x1_B(tmp4, 1, tmp2, 5, src_u, 4);
+  src_u += pitch;
+  VP8_ST6x1_B(tmp4, 2, tmp2, 6, src_u, 4);
+  src_u += pitch;
+  VP8_ST6x1_B(tmp4, 3, tmp2, 7, src_u, 4);
+
+  src_v += 1 - pitch_x4;
+  VP8_ST6x1_B(tmp6, 0, tmp5, 0, src_v, 4);
+  src_v += pitch;
+  VP8_ST6x1_B(tmp6, 1, tmp5, 1, src_v, 4);
+  src_v += pitch;
+  VP8_ST6x1_B(tmp6, 2, tmp5, 2, src_v, 4);
+  src_v += pitch;
+  VP8_ST6x1_B(tmp6, 3, tmp5, 3, src_v, 4);
+  src_v += pitch;
+  VP8_ST6x1_B(tmp7, 0, tmp5, 4, src_v, 4);
+  src_v += pitch;
+  VP8_ST6x1_B(tmp7, 1, tmp5, 5, src_v, 4);
+  src_v += pitch;
+  VP8_ST6x1_B(tmp7, 2, tmp5, 6, src_v, 4);
+  src_v += pitch;
+  VP8_ST6x1_B(tmp7, 3, tmp5, 7, src_v, 4);
+}
+
+void vp8_loop_filter_mbh_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
+                             int32_t pitch_y, int32_t pitch_u_v,
+                             loop_filter_info *lpf_info_ptr) {
+  mbloop_filter_horizontal_edge_y_lsx(src_y, pitch_y, *lpf_info_ptr->mblim,
+                                      *lpf_info_ptr->lim,
+                                      *lpf_info_ptr->hev_thr);
+  if (src_u) {
+    mbloop_filter_horizontal_edge_uv_lsx(
+        src_u, src_v, pitch_u_v, *lpf_info_ptr->mblim, *lpf_info_ptr->lim,
+        *lpf_info_ptr->hev_thr);
+  }
+}
+
+void vp8_loop_filter_mbv_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
+                             int32_t pitch_y, int32_t pitch_u_v,
+                             loop_filter_info *lpf_info_ptr) {
+  mbloop_filter_vertical_edge_y_lsx(src_y, pitch_y, *lpf_info_ptr->mblim,
+                                    *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr);
+  if (src_u) {
+    mbloop_filter_vertical_edge_uv_lsx(src_u, src_v, pitch_u_v,
+                                       *lpf_info_ptr->mblim, *lpf_info_ptr->lim,
+                                       *lpf_info_ptr->hev_thr);
+  }
+}
+
+void vp8_loop_filter_bh_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
+                            int32_t pitch_y, int32_t pitch_u_v,
+                            loop_filter_info *lpf_info_ptr) {
+  loop_filter_horizontal_4_dual_lsx(src_y + 4 * pitch_y, pitch_y,
+                                    lpf_info_ptr->blim, lpf_info_ptr->lim,
+                                    lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
+                                    lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
+  loop_filter_horizontal_4_dual_lsx(src_y + 8 * pitch_y, pitch_y,
+                                    lpf_info_ptr->blim, lpf_info_ptr->lim,
+                                    lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
+                                    lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
+  loop_filter_horizontal_4_dual_lsx(src_y + 12 * pitch_y, pitch_y,
+                                    lpf_info_ptr->blim, lpf_info_ptr->lim,
+                                    lpf_info_ptr->hev_thr, lpf_info_ptr->blim,
+                                    lpf_info_ptr->lim, lpf_info_ptr->hev_thr);
+  if (src_u) {
+    loop_filter_horizontal_edge_uv_lsx(
+        src_u + (4 * pitch_u_v), src_v + (4 * pitch_u_v), pitch_u_v,
+        *lpf_info_ptr->blim, *lpf_info_ptr->lim, *lpf_info_ptr->hev_thr);
+  }
+}
+
+void vp8_loop_filter_bv_lsx(uint8_t *src_y, uint8_t *src_u, uint8_t *src_v,
+                            int32_t pitch_y, int32_t pitch_u_v,
+                            loop_filter_info *lpf_info_ptr) {
+  loop_filter_vertical_4_dual_lsx(src_y + 4, pitch_y, lpf_info_ptr->blim,
+                                  lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
+                                  lpf_info_ptr->blim, lpf_info_ptr->lim,
+                                  lpf_info_ptr->hev_thr);
+  loop_filter_vertical_4_dual_lsx(src_y + 8, pitch_y, lpf_info_ptr->blim,
+                                  lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
+                                  lpf_info_ptr->blim, lpf_info_ptr->lim,
+                                  lpf_info_ptr->hev_thr);
+  loop_filter_vertical_4_dual_lsx(src_y + 12, pitch_y, lpf_info_ptr->blim,
+                                  lpf_info_ptr->lim, lpf_info_ptr->hev_thr,
+                                  lpf_info_ptr->blim, lpf_info_ptr->lim,
+                                  lpf_info_ptr->hev_thr);
+  if (src_u) {
+    loop_filter_vertical_edge_uv_lsx(src_u + 4, src_v + 4, pitch_u_v,
+                                     *lpf_info_ptr->blim, *lpf_info_ptr->lim,
+                                     *lpf_info_ptr->hev_thr);
+  }
+}
diff --git a/libvpx/vp8/common/loongarch/sixtap_filter_lsx.c b/libvpx/vp8/common/loongarch/sixtap_filter_lsx.c
new file mode 100644
index 0000000..cd7ba54
--- /dev/null
+++ b/libvpx/vp8/common/loongarch/sixtap_filter_lsx.c
@@ -0,0 +1,1903 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vp8/common/filter.h"
+#include "vpx_ports/mem.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+DECLARE_ALIGNED(16, static const int8_t, vp8_subpel_filters_lsx[7][8]) = {
+  { 0, -6, 123, 12, -1, 0, 0, 0 },
+  { 2, -11, 108, 36, -8, 1, 0, 0 }, /* New 1/4 pel 6 tap filter */
+  { 0, -9, 93, 50, -6, 0, 0, 0 },
+  { 3, -16, 77, 77, -16, 3, 0, 0 }, /* New 1/2 pel 6 tap filter */
+  { 0, -6, 50, 93, -9, 0, 0, 0 },
+  { 1, -8, 36, 108, -11, 2, 0, 0 }, /* New 1/4 pel 6 tap filter */
+  { 0, -1, 12, 123, -6, 0, 0, 0 },
+};
+
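+/* Byte-shuffle indexes for __lsx_vshuf_b that gather the sliding pixel
+   pairs consumed by the filter dot products; indexes >= 16 select bytes
+   from the second source register in the 4-width cases. */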
+static const uint8_t vp8_mc_filt_mask_arr[16 * 3] = {
+  /* 8 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  /* 4 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+  /* 4 width cases */
+  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
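+/* Three-term dot-product accumulate: multiplies byte pairs from in0..in2 by
+   the corresponding filter taps and sums into halfword lanes - the core
+   step of the 6-tap filter. */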
+static INLINE __m128i dpadd_h3(__m128i in0, __m128i in1, __m128i in2,
+                               __m128i coeff0, __m128i coeff1, __m128i coeff2) {
+  __m128i out0_m;
+
+  out0_m = __lsx_vdp2_h_b(in0, coeff0);
+  out0_m = __lsx_vdp2add_h_b(out0_m, in1, coeff1);
+  out0_m = __lsx_vdp2add_h_b(out0_m, in2, coeff2);
+
+  return out0_m;
+}
+
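+/* Gathers the 6-pixel windows with the shuffle masks, runs the 6-tap dot
+   product, then rounds by VP8_FILTER_SHIFT and saturates to the 8-bit
+   signed range. */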
+static INLINE __m128i horiz_6tap_filt(__m128i src0, __m128i src1, __m128i mask0,
+                                      __m128i mask1, __m128i mask2,
+                                      __m128i filt_h0, __m128i filt_h1,
+                                      __m128i filt_h2) {
+  __m128i vec0_m, vec1_m, vec2_m;
+  __m128i hz_out_m;
+
+  DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m,
+            vec1_m);
+  vec2_m = __lsx_vshuf_b(src1, src0, mask2);
+  hz_out_m = dpadd_h3(vec0_m, vec1_m, vec2_m, filt_h0, filt_h1, filt_h2);
+  hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT);
+  hz_out_m = __lsx_vsat_h(hz_out_m, 7);
+
+  return hz_out_m;
+}
+
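+/* Two-term dot-product accumulate used by the 4-tap filter paths. */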
+static INLINE __m128i filt_4tap_dpadd_h(__m128i vec0, __m128i vec1,
+                                        __m128i filt0, __m128i filt1) {
+  __m128i tmp_m;
+
+  tmp_m = __lsx_vdp2_h_b(vec0, filt0);
+  tmp_m = __lsx_vdp2add_h_b(tmp_m, vec1, filt1);
+
+  return tmp_m;
+}
+
+static INLINE __m128i horiz_4tap_filt(__m128i src0, __m128i src1, __m128i mask0,
+                                      __m128i mask1, __m128i filt_h0,
+                                      __m128i filt_h1) {
+  __m128i vec0_m, vec1_m, hz_out_m;
+
+  DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src1, src0, mask1, vec0_m,
+            vec1_m);
+  hz_out_m = filt_4tap_dpadd_h(vec0_m, vec1_m, filt_h0, filt_h1);
+  hz_out_m = __lsx_vsrari_h(hz_out_m, VP8_FILTER_SHIFT);
+  hz_out_m = __lsx_vsat_h(hz_out_m, 7);
+
+  return hz_out_m;
+}
+
+#define HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
+                                   mask2, filt0, filt1, filt2, out0, out1) \
+  do {                                                                     \
+    __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m;                \
+                                                                           \
+    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \
+              vec1_m);                                                     \
+    DUP2_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, out0, out1);   \
+    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask1, src3, src2, mask1, vec2_m, \
+              vec3_m);                                                     \
+    DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \
+              out0, out1);                                                 \
+    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask2, src3, src2, mask2, vec4_m, \
+              vec5_m);                                                     \
+    DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2, \
+              out0, out1);                                                 \
+  } while (0)
+
+#define HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,    \
+                                   mask2, filt0, filt1, filt2, out0, out1,  \
+                                   out2, out3)                              \
+  do {                                                                      \
+    __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+                                                                            \
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m,  \
+              vec1_m);                                                      \
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, vec2_m,  \
+              vec3_m);                                                      \
+    DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0,  \
+              vec3_m, filt0, out0, out1, out2, out3);                       \
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, vec0_m,  \
+              vec1_m);                                                      \
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, vec2_m,  \
+              vec3_m);                                                      \
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, vec4_m,  \
+              vec5_m);                                                      \
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask2, src3, src3, mask2, vec6_m,  \
+              vec7_m);                                                      \
+    DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1,  \
+              out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2,   \
+              out3);                                                        \
+    DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec4_m, filt2, out1, vec5_m, filt2,  \
+              out2, vec6_m, filt2, out3, vec7_m, filt2, out0, out1, out2,   \
+              out3);                                                        \
+  } while (0)
+
+#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
+                                   filt0, filt1, out0, out1)               \
+  do {                                                                     \
+    __m128i vec0_m, vec1_m, vec2_m, vec3_m;                                \
+                                                                           \
+    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask0, src3, src2, mask0, vec0_m, \
+              vec1_m);                                                     \
+    DUP2_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, out0, out1);   \
+    DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask1, src3, src2, mask1, vec2_m, \
+              vec3_m);                                                     \
+    DUP2_ARG3(__lsx_vdp2add_h_b, out0, vec2_m, filt1, out1, vec3_m, filt1, \
+              out0, out1);                                                 \
+  } while (0)
+
+#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,   \
+                                   filt0, filt1, out0, out1, out2, out3)   \
+  do {                                                                     \
+    __m128i vec0_m, vec1_m, vec2_m, vec3_m;                                \
+                                                                           \
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, vec0_m, \
+              vec1_m);                                                     \
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask0, src3, src3, mask0, vec2_m, \
+              vec3_m);                                                     \
+    DUP4_ARG2(__lsx_vdp2_h_b, vec0_m, filt0, vec1_m, filt0, vec2_m, filt0, \
+              vec3_m, filt0, out0, out1, out2, out3);                      \
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, vec0_m, \
+              vec1_m);                                                     \
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask1, src3, src3, mask1, vec2_m, \
+              vec3_m);                                                     \
+    DUP4_ARG3(__lsx_vdp2add_h_b, out0, vec0_m, filt1, out1, vec1_m, filt1, \
+              out2, vec2_m, filt1, out3, vec3_m, filt1, out0, out1, out2,  \
+              out3);                                                       \
+  } while (0)
+
+static inline void common_hz_6t_4x4_lsx(uint8_t *RESTRICT src,
+                                        int32_t src_stride,
+                                        uint8_t *RESTRICT dst,
+                                        int32_t dst_stride,
+                                        const int8_t *filter) {
+  __m128i src0, src1, src2, src3, filt0, filt1, filt2;
+  __m128i mask0, mask1, mask2, out0, out1;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 2;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+  filt2 = __lsx_vldrepl_h(filter, 4);
+
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride_x3);
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
+                             filt1, filt2, out0, out1);
+  out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
+  out0 = __lsx_vxori_b(out0, 128);
+
+  __lsx_vstelm_w(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 3);
+}
+
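+/* 6-tap horizontal filter, 4x8 block: the 4x4 kernel applied to two batches
+   of four rows. */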
+static void common_hz_6t_4x8_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter) {
+  __m128i src0, src1, src2, src3, filt0, filt1, filt2;
+  __m128i mask0, mask1, mask2, out0, out1, out2, out3;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride_x2 << 1;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 2;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+  filt2 = __lsx_vldrepl_h(filter, 4);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride_x3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src += src_stride_x4;
+  HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
+                             filt1, filt2, out0, out1);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride_x3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_6TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
+                             filt1, filt2, out2, out3);
+
+  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+            VP8_FILTER_SHIFT, out0, out1);
+  DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+  __lsx_vstelm_w(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 3);
+  dst += dst_stride;
+
+  __lsx_vstelm_w(out1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 3);
+}
+
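+/* Dispatch for 4-wide horizontal 6-tap filtering (height 4 or 8). */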
+static void common_hz_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_hz_6t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_hz_6t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
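+/* 6-tap horizontal filter, 8-wide.  The first four rows are peeled, then
+   the loop produces four rows per iteration. */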
+static void common_hz_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, filt0, filt1, filt2;
+  __m128i mask0, mask1, mask2, tmp0, tmp1;
+  __m128i filt, out0, out1, out2, out3;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= 2;
+
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+  filt2 = __lsx_vreplvei_h(filt, 2);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+  DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+            src_stride_x3, src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src += src_stride_x4;
+  HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, filt0,
+                             filt1, filt2, out0, out1, out2, out3);
+  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+            VP8_FILTER_SHIFT, tmp0, tmp1);
+  DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+  __lsx_vstelm_d(tmp0, dst, 0, 0);
+  __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
+  __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
+  dst += dst_stride_x4;
+
+  for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    src += src_stride_x4;
+    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+              VP8_FILTER_SHIFT, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    __lsx_vstelm_d(tmp0, dst, 0, 0);
+    __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+  }
+}
+
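+/* 6-tap horizontal filter, 16-wide: each row is covered by two 16-byte
+   loads (offsets 0 and 8), each yielding eight filtered pixels that are
+   packed back to 16 bytes on store. */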
+static void common_hz_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, filt0, filt1, filt2;
+  __m128i mask0, mask1, mask2, out;
+  __m128i filt, out0, out1, out2, out3, out4, out5, out6, out7;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= 2;
+
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+  filt2 = __lsx_vreplvei_h(filt, 2);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src0, src2, src4, src6);
+    src += 8;
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src1, src3, src5, src7);
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128, src4,
+              src5, src6, src7);
+    src += src_stride_x4 - 8;
+
+    HORIZ_6TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out0, out1, out2, out3);
+    HORIZ_6TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
+                               filt0, filt1, filt2, out4, out5, out6, out7);
+    DUP4_ARG2(__lsx_vsrari_h, out0, VP8_FILTER_SHIFT, out1, VP8_FILTER_SHIFT,
+              out2, VP8_FILTER_SHIFT, out3, VP8_FILTER_SHIFT, out0, out1, out2,
+              out3);
+    DUP4_ARG2(__lsx_vsrari_h, out4, VP8_FILTER_SHIFT, out5, VP8_FILTER_SHIFT,
+              out6, VP8_FILTER_SHIFT, out7, VP8_FILTER_SHIFT, out4, out5, out6,
+              out7);
+    DUP4_ARG2(__lsx_vsat_h, out0, 7, out1, 7, out2, 7, out3, 7, out0, out1,
+              out2, out3);
+    DUP4_ARG2(__lsx_vsat_h, out4, 7, out5, 7, out6, 7, out7, 7, out4, out5,
+              out6, out7);
+    out = __lsx_vpickev_b(out1, out0);
+    out = __lsx_vxori_b(out, 128);
+    __lsx_vst(out, dst, 0);
+    out = __lsx_vpickev_b(out3, out2);
+    out = __lsx_vxori_b(out, 128);
+    __lsx_vstx(out, dst, dst_stride);
+    out = __lsx_vpickev_b(out5, out4);
+    out = __lsx_vxori_b(out, 128);
+    __lsx_vstx(out, dst, dst_stride_x2);
+    out = __lsx_vpickev_b(out7, out6);
+    out = __lsx_vxori_b(out, 128);
+    __lsx_vstx(out, dst, dst_stride_x3);
+    dst += dst_stride_x4;
+  }
+}
+
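+/* 6-tap vertical filter, 4-wide.  Interleaved row pairs are packed two per
+   vector; the three-vector window slides down four rows per iteration. */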
+static void common_vt_6t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+  __m128i src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
+  __m128i out0, out1;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+  filt2 = __lsx_vldrepl_h(filter, 4);
+
+  DUP2_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src0, src1);
+  src2 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src3, src4);
+  src += src_stride_x3;
+
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+            src10_r, src21_r, src32_r, src43_r);
+  DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src2110,
+            src4332);
+  DUP2_ARG2(__lsx_vxori_b, src2110, 128, src4332, 128, src2110, src4332);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    src5 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src6, src7);
+    src8 = __lsx_vldx(src, src_stride_x3);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+              src54_r, src65_r, src76_r, src87_r);
+    DUP2_ARG2(__lsx_vilvl_d, src65_r, src54_r, src87_r, src76_r, src6554,
+              src8776);
+    DUP2_ARG2(__lsx_vxori_b, src6554, 128, src8776, 128, src6554, src8776);
+    out0 = dpadd_h3(src2110, src4332, src6554, filt0, filt1, filt2);
+    out1 = dpadd_h3(src4332, src6554, src8776, filt0, filt1, filt2);
+
+    out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
+    out0 = __lsx_vxori_b(out0, 128);
+
+    __lsx_vstelm_w(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 3);
+    dst += dst_stride;
+
+    src2110 = src6554;
+    src4332 = src8776;
+    src4 = src8;
+  }
+}
+
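+/* 6-tap vertical filter, 8-wide.  The interleaved row-pair context is
+   carried across iterations, producing four rows per pass. */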
+static void common_vt_6t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src7, src8, src9, src10;
+  __m128i src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
+  __m128i src109_r, filt0, filt1, filt2;
+  __m128i tmp0, tmp1;
+  __m128i filt, out0_r, out1_r, out2_r, out3_r;
+
+  src -= src_stride_x2;
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+  filt2 = __lsx_vreplvei_h(filt, 2);
+
+  DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+            src_stride_x3, src0, src1, src2, src3);
+  src += src_stride_x4;
+  src4 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src4 = __lsx_vxori_b(src4, 128);
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src2, src1, src4, src3,
+            src10_r, src32_r, src21_r, src43_r);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src7, src8, src9, src10);
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vilvl_b, src7, src4, src8, src7, src9, src8, src10, src9,
+              src76_r, src87_r, src98_r, src109_r);
+    out0_r = dpadd_h3(src10_r, src32_r, src76_r, filt0, filt1, filt2);
+    out1_r = dpadd_h3(src21_r, src43_r, src87_r, filt0, filt1, filt2);
+    out2_r = dpadd_h3(src32_r, src76_r, src98_r, filt0, filt1, filt2);
+    out3_r = dpadd_h3(src43_r, src87_r, src109_r, filt0, filt1, filt2);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r,
+              out2_r, VP8_FILTER_SHIFT, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    __lsx_vstelm_d(tmp0, dst, 0, 0);
+    __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+
+    src10_r = src76_r;
+    src32_r = src98_r;
+    src21_r = src87_r;
+    src43_r = src109_r;
+    src4 = src10;
+  }
+}
+
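+/* 6-tap vertical filter, 16-wide: low and high byte halves of each row pair
+   (vilvl_b/vilvh_b) are filtered separately and rejoined on store. */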
+static void common_vt_6t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+  __m128i src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
+  __m128i src65_l, src87_l, filt0, filt1, filt2;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+  src -= src_stride_x2;
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+  filt2 = __lsx_vreplvei_h(filt, 2);
+
+  DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+            src_stride_x3, src0, src1, src2, src3);
+  src += src_stride_x4;
+  src4 = __lsx_vldx(src, 0);
+  src += src_stride;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src4 = __lsx_vxori_b(src4, 128);
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src4, src3, src2, src1,
+            src10_r, src32_r, src43_r, src21_r);
+  DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src4, src3, src2, src1,
+            src10_l, src32_l, src43_l, src21_l);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src5, src6, src7, src8);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
+              src6, src7, src8);
+    DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+              src54_r, src65_r, src76_r, src87_r);
+    DUP4_ARG2(__lsx_vilvh_b, src5, src4, src6, src5, src7, src6, src8, src7,
+              src54_l, src65_l, src76_l, src87_l);
+    out0_r = dpadd_h3(src10_r, src32_r, src54_r, filt0, filt1, filt2);
+    out1_r = dpadd_h3(src21_r, src43_r, src65_r, filt0, filt1, filt2);
+    out2_r = dpadd_h3(src32_r, src54_r, src76_r, filt0, filt1, filt2);
+    out3_r = dpadd_h3(src43_r, src65_r, src87_r, filt0, filt1, filt2);
+    out0_l = dpadd_h3(src10_l, src32_l, src54_l, filt0, filt1, filt2);
+    out1_l = dpadd_h3(src21_l, src43_l, src65_l, filt0, filt1, filt2);
+    out2_l = dpadd_h3(src32_l, src54_l, src76_l, filt0, filt1, filt2);
+    out3_l = dpadd_h3(src43_l, src65_l, src87_l, filt0, filt1, filt2);
+    DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l,
+              out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT,
+              out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp2, 128, tmp3, 128, tmp0,
+              tmp1, tmp2, tmp3);
+    __lsx_vstx(tmp0, dst, 0);
+    __lsx_vstx(tmp1, dst, dst_stride);
+    __lsx_vstx(tmp2, dst, dst_stride_x2);
+    __lsx_vstx(tmp3, dst, dst_stride_x3);
+    dst += dst_stride_x4;
+
+    src10_r = src54_r;
+    src32_r = src76_r;
+    src21_r = src65_r;
+    src43_r = src87_r;
+    src10_l = src54_l;
+    src32_l = src76_l;
+    src21_l = src65_l;
+    src43_l = src87_l;
+    src4 = src8;
+  }
+}
+
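+/* 6-tap horizontal plus 6-tap vertical filter, 4-wide.  Horizontal outputs
+   hold two rows per vector; 'shuff' rebuilds the in-between row pair from
+   halves of two adjacent outputs before the vertical pass. */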
+static void common_hv_6ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, tmp0, tmp1;
+  __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  __m128i hz_out7, filt_vt0, filt_vt1, filt_vt2, out0, out1, out2, out3;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 2;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
+            filt_hz1);
+  filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4);
+  DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
+            filt_vt1);
+  filt_vt2 = __lsx_vldrepl_h(filter_vert, 4);
+
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+  DUP2_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src0, src1);
+  src2 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src3, src4);
+  src += src_stride_x3;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src4 = __lsx_vxori_b(src4, 128);
+
+  hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out2 = horiz_6tap_filt(src2, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
+  hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    src5 = __lsx_vld(src, 0);
+    src6 = __lsx_vldx(src, src_stride);
+    src += src_stride_x2;
+
+    DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6);
+    hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
+
+    src7 = __lsx_vld(src, 0);
+    src8 = __lsx_vldx(src, src_stride);
+    src += src_stride_x2;
+
+    DUP2_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src7, src8);
+    hz_out7 = horiz_6tap_filt(src7, src8, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff);
+
+    out2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+    out3 = __lsx_vpackev_b(hz_out7, hz_out6);
+    tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
+
+    tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, VP8_FILTER_SHIFT);
+    tmp0 = __lsx_vxori_b(tmp0, 128);
+    __lsx_vstelm_w(tmp0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 3);
+    dst += dst_stride;
+
+    hz_out3 = hz_out7;
+    out0 = out2;
+    out1 = out3;
+  }
+}
+
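+/* 6-tap horizontal plus 6-tap vertical filter, 8-wide.  Five rows of
+   horizontal output prime the vertical stage ahead of the main loop. */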
+static void common_hv_6ht_6vt_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i filt_hz0, filt_hz1, filt_hz2;
+  __m128i mask0, mask1, mask2, vec0, vec1;
+  __m128i filt, filt_vt0, filt_vt1, filt_vt2;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= (2 + src_stride_x2);
+
+  filt = __lsx_vld(filter_horiz, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1);
+  filt_hz2 = __lsx_vreplvei_h(filt, 2);
+
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+            src_stride_x3, src0, src1, src2, src3);
+  src += src_stride_x4;
+  src4 = __lsx_vldx(src, 0);
+  src += src_stride;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src4 = __lsx_vxori_b(src4, 128);
+
+  hz_out0 = horiz_6tap_filt(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out1 = horiz_6tap_filt(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out2 = horiz_6tap_filt(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out3 = horiz_6tap_filt(src3, src3, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out4 = horiz_6tap_filt(src4, src4, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  filt = __lsx_vld(filter_vert, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1);
+  filt_vt2 = __lsx_vreplvei_h(filt, 2);
+
+  DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out2,
+            hz_out1, hz_out4, hz_out3, out0, out1, out3, out4);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src5, src6, src7, src8);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
+              src6, src7, src8);
+    hz_out5 = horiz_6tap_filt(src5, src5, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    out2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out6 = horiz_6tap_filt(src6, src6, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    out5 = __lsx_vpackev_b(hz_out6, hz_out5);
+    tmp1 = dpadd_h3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out7 = horiz_6tap_filt(src7, src7, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    out7 = __lsx_vpackev_b(hz_out7, hz_out6);
+    tmp2 = dpadd_h3(out1, out2, out7, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out8 = horiz_6tap_filt(src8, src8, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    out6 = __lsx_vpackev_b(hz_out8, hz_out7);
+    tmp3 = dpadd_h3(out4, out5, out6, filt_vt0, filt_vt1, filt_vt2);
+
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, VP8_FILTER_SHIFT, tmp3, tmp2,
+              VP8_FILTER_SHIFT, vec0, vec1);
+    DUP2_ARG2(__lsx_vxori_b, vec0, 128, vec1, 128, vec0, vec1);
+
+    __lsx_vstelm_d(vec0, dst, 0, 0);
+    __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(vec1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(vec1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+
+    hz_out4 = hz_out8;
+    out0 = out2;
+    out1 = out7;
+    out3 = out5;
+    out4 = out6;
+  }
+}
+
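+/* 16-wide 6-tap/6-tap two-pass filter: run the 8-wide kernel on each half. */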
+static void common_hv_6ht_6vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height) {
+  common_hv_6ht_6vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                           filter_vert, height);
+  common_hv_6ht_6vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride,
+                           filter_horiz, filter_vert, height);
+}
+
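+/* 4-tap horizontal filter, 4x4 block (src -= 1 for the shorter window). */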
+static void common_hz_4t_4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter) {
+  __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+  __m128i out0, out1;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 1;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride_x3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1,
+                             out0, out1);
+
+  out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
+  out0 = __lsx_vxori_b(out0, 128);
+
+  __lsx_vstelm_w(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 3);
+}
+
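+/* 4-tap horizontal filter, 4x8 block: two batches of four rows. */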
+static void common_hz_4t_4x8_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter) {
+  __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+  __m128i out0, out1, out2, out3;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 1;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride_x3);
+  src += src_stride_x4;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1,
+                             out0, out1);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride_x3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0, filt1,
+                             out2, out3);
+  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+            VP8_FILTER_SHIFT, out0, out1);
+  DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+  __lsx_vstelm_w(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 3);
+  dst += dst_stride;
+
+  __lsx_vstelm_w(out1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 3);
+}
+
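+/* Dispatch for 4-wide horizontal 4-tap filtering (height 4 or 8). */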
+static void common_hz_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_hz_4t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_hz_4t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
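+/* 4-tap horizontal filter, 8-wide, four rows per loop iteration. */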
+static void common_hz_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, filt0, filt1, mask0, mask1;
+  __m128i tmp0, tmp1;
+  __m128i filt, out0, out1, out2, out3;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= 1;
+
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src0, src1, src2, src3);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                               filt1, out0, out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+              VP8_FILTER_SHIFT, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    __lsx_vstelm_d(tmp0, dst, 0, 0);
+    __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+  }
+}
+
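+/* 4-tap horizontal filter, 16-wide: rows are loaded in two halves and the
+   filtered results packed back to 16 bytes on store. */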
+static void common_hz_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i filt0, filt1, mask0, mask1;
+  __m128i filt, out0, out1, out2, out3, out4, out5, out6, out7;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= 1;
+
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src0, src2, src4, src6);
+    src += 8;
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src1, src3, src5, src7);
+    src += src_stride_x4 - 8;
+
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    DUP4_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src6, 128, src7, 128, src4,
+              src5, src6, src7);
+    HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
+                               filt1, out0, out1, out2, out3);
+    HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
+                               filt1, out4, out5, out6, out7);
+    DUP4_ARG3(__lsx_vssrarni_b_h, out1, out0, VP8_FILTER_SHIFT, out3, out2,
+              VP8_FILTER_SHIFT, out5, out4, VP8_FILTER_SHIFT, out7, out6,
+              VP8_FILTER_SHIFT, out0, out1, out2, out3);
+    DUP4_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out2, 128, out3, 128, out0,
+              out1, out2, out3);
+    __lsx_vstx(out0, dst, 0);
+    __lsx_vstx(out1, dst, dst_stride);
+    __lsx_vstx(out2, dst, dst_stride_x2);
+    __lsx_vstx(out3, dst, dst_stride_x3);
+    dst += dst_stride_x4;
+  }
+}
+
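+/* 4-tap vertical filter, 4-wide: interleaved row pairs packed two per
+   vector, with a two-vector dot-product context. */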
+static void common_vt_4t_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5;
+  __m128i src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
+  __m128i src2110, src4332, filt0, filt1, out0, out1;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filt0, filt1);
+  DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2);
+  src1 = __lsx_vld(src, 0);
+  src += src_stride_x2;
+
+  DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+
+  src2110 = __lsx_vilvl_d(src21_r, src10_r);
+  src2110 = __lsx_vxori_b(src2110, 128);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    src3 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5);
+    src += src_stride_x3;
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src32_r, src43_r);
+    src4332 = __lsx_vilvl_d(src43_r, src32_r);
+    src4332 = __lsx_vxori_b(src4332, 128);
+    out0 = filt_4tap_dpadd_h(src2110, src4332, filt0, filt1);
+
+    src2 = __lsx_vld(src, 0);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vilvl_b, src5, src4, src2, src5, src54_r, src65_r);
+    src2110 = __lsx_vilvl_d(src65_r, src54_r);
+    src2110 = __lsx_vxori_b(src2110, 128);
+    out1 = filt_4tap_dpadd_h(src4332, src2110, filt0, filt1);
+    out0 = __lsx_vssrarni_b_h(out1, out0, VP8_FILTER_SHIFT);
+    out0 = __lsx_vxori_b(out0, 128);
+
+    __lsx_vstelm_w(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 3);
+    dst += dst_stride;
+  }
+}
+
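+/* 4-tap vertical filter, 8-wide, four rows per loop iteration. */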
+static void common_vt_4t_8w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                uint8_t *RESTRICT dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src7, src8, src9, src10;
+  __m128i src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
+  __m128i tmp0, tmp1;
+  __m128i filt, out0_r, out1_r, out2_r, out3_r;
+
+  src -= src_stride;
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+
+  DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1);
+  src2 = __lsx_vldx(src, src_stride_x2);
+  src += src_stride_x3;
+
+  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+  src2 = __lsx_vxori_b(src2, 128);
+  DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src7, src8, src9, src10);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    DUP4_ARG2(__lsx_vilvl_b, src7, src2, src8, src7, src9, src8, src10, src9,
+              src72_r, src87_r, src98_r, src109_r);
+    out0_r = filt_4tap_dpadd_h(src10_r, src72_r, filt0, filt1);
+    out1_r = filt_4tap_dpadd_h(src21_r, src87_r, filt0, filt1);
+    out2_r = filt_4tap_dpadd_h(src72_r, src98_r, filt0, filt1);
+    out3_r = filt_4tap_dpadd_h(src87_r, src109_r, filt0, filt1);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1_r, out0_r, VP8_FILTER_SHIFT, out3_r,
+              out2_r, VP8_FILTER_SHIFT, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    __lsx_vstelm_d(tmp0, dst, 0, 0);
+    __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(tmp1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+
+    src10_r = src98_r;
+    src21_r = src109_r;
+    src2 = src10;
+  }
+}
+
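+/* 4-tap vertical filter, 16-wide: low and high byte halves are filtered
+   separately and rejoined on store. */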
+static void common_vt_4t_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6;
+  __m128i src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
+  __m128i src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
+
+  src -= src_stride;
+  filt = __lsx_vld(filter, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt0, filt1);
+
+  DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1);
+  src2 = __lsx_vldx(src, src_stride_x2);
+  src += src_stride_x3;
+
+  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+  src2 = __lsx_vxori_b(src2, 128);
+  DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src10_r, src21_r);
+  DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, src10_l, src21_l);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src3, src4, src5, src6);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
+              src4, src5, src6);
+    DUP4_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, src5, src4, src6, src5,
+              src32_r, src43_r, src54_r, src65_r);
+    DUP4_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, src5, src4, src6, src5,
+              src32_l, src43_l, src54_l, src65_l);
+    out0_r = filt_4tap_dpadd_h(src10_r, src32_r, filt0, filt1);
+    out1_r = filt_4tap_dpadd_h(src21_r, src43_r, filt0, filt1);
+    out2_r = filt_4tap_dpadd_h(src32_r, src54_r, filt0, filt1);
+    out3_r = filt_4tap_dpadd_h(src43_r, src65_r, filt0, filt1);
+    out0_l = filt_4tap_dpadd_h(src10_l, src32_l, filt0, filt1);
+    out1_l = filt_4tap_dpadd_h(src21_l, src43_l, filt0, filt1);
+    out2_l = filt_4tap_dpadd_h(src32_l, src54_l, filt0, filt1);
+    out3_l = filt_4tap_dpadd_h(src43_l, src65_l, filt0, filt1);
+    DUP4_ARG3(__lsx_vssrarni_b_h, out0_l, out0_r, VP8_FILTER_SHIFT, out1_l,
+              out1_r, VP8_FILTER_SHIFT, out2_l, out2_r, VP8_FILTER_SHIFT,
+              out3_l, out3_r, VP8_FILTER_SHIFT, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp2, 128, tmp3, 128, tmp0,
+              tmp1, tmp2, tmp3);
+    __lsx_vstx(tmp0, dst, 0);
+    __lsx_vstx(tmp1, dst, dst_stride);
+    __lsx_vstx(tmp2, dst, dst_stride_x2);
+    __lsx_vstx(tmp3, dst, dst_stride_x3);
+    dst += dst_stride_x4;
+
+    src10_r = src54_r;
+    src21_r = src65_r;
+    src10_l = src54_l;
+    src21_l = src65_l;
+    src2 = src6;
+  }
+}
+
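+/* 4-tap horizontal plus 4-tap vertical filter, 4-wide. */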
+static void common_hv_4ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
+  __m128i mask0, mask1, filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 1;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
+            filt_hz1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  src1 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2);
+  src += src_stride_x2;
+
+  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+  src2 = __lsx_vxori_b(src2, 128);
+  hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out1 = horiz_4tap_filt(src1, src2, mask0, mask1, filt_hz0, filt_hz1);
+  vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
+            filt_vt1);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    src3 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5);
+    src6 = __lsx_vldx(src, src_stride_x3);
+    src += src_stride_x4;
+
+    DUP2_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src3, src4);
+    hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff);
+    vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
+    tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
+
+    DUP2_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src5, src6);
+    hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
+    vec2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1);
+
+    tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, VP8_FILTER_SHIFT);
+    tmp0 = __lsx_vxori_b(tmp0, 128);
+    __lsx_vstelm_w(tmp0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 3);
+    dst += dst_stride;
+
+    hz_out1 = hz_out5;
+    vec0 = vec2;
+  }
+}
+
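+/* 4-tap horizontal plus 4-tap vertical filter, 8-wide. */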
+static inline void common_hv_4ht_4vt_8w_lsx(
+    uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
+    int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
+    int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, filt_hz0, filt_hz1;
+  __m128i mask0, mask1, out0, out1;
+  __m128i filt, filt_vt0, filt_vt1, tmp0, tmp1, tmp2, tmp3;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3;
+  __m128i vec0, vec1, vec2, vec3, vec4;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= 1 + src_stride;
+
+  filt = __lsx_vld(filter_horiz, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1);
+  src2 = __lsx_vldx(src, src_stride_x2);
+  src += src_stride_x3;
+
+  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+  src2 = __lsx_vxori_b(src2, 128);
+  hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2);
+
+  filt = __lsx_vld(filter_vert, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src3, src4, src5, src6);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
+              src4, src5, src6);
+    hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+    vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
+    tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
+
+    hz_out0 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
+    vec3 = __lsx_vpackev_b(hz_out0, hz_out3);
+    tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1);
+
+    hz_out1 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+    vec4 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp2 = filt_4tap_dpadd_h(vec1, vec4, filt_vt0, filt_vt1);
+
+    hz_out2 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec0, vec1);
+    tmp3 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
+
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, VP8_FILTER_SHIFT, tmp3, tmp2,
+              VP8_FILTER_SHIFT, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+
+    vec0 = vec4;
+    vec2 = vec1;
+  }
+}
+
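+/* 16-wide 4-tap/4-tap two-pass filter: run the 8-wide kernel on each half. */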
+static void common_hv_4ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height) {
+  common_hv_4ht_4vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                           filter_vert, height);
+  common_hv_4ht_4vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride,
+                           filter_horiz, filter_vert, height);
+}
+
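+/* 6-tap horizontal plus 4-tap vertical filter, 4-wide. */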
+static void common_hv_6ht_4vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6;
+  __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
+  __m128i filt_vt0, filt_vt1, tmp0, tmp1, vec0, vec1, vec2;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+  src -= 2;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
+            filt_hz1);
+  filt_hz2 = __lsx_vldrepl_h(filter_horiz, 4);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+  src1 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, -src_stride, src, src_stride, src0, src2);
+  src += src_stride_x2;
+
+  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+  src2 = __lsx_vxori_b(src2, 128);
+
+  hz_out0 = horiz_6tap_filt(src0, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out1 = horiz_6tap_filt(src1, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
+            filt_vt1);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    src3 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src4, src5);
+    src6 = __lsx_vldx(src, src_stride_x3);
+    src += src_stride_x4;
+    DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
+              src4, src5, src6);
+
+    hz_out3 = horiz_6tap_filt(src3, src4, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out2 = __lsx_vshuf_b(hz_out3, hz_out1, shuff);
+    vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
+    tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
+
+    hz_out5 = horiz_6tap_filt(src5, src6, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
+    vec2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp1 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1);
+
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, VP8_FILTER_SHIFT, tmp1, tmp1,
+              VP8_FILTER_SHIFT, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+
+    __lsx_vstelm_w(tmp0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp1, dst, 0, 1);
+    dst += dst_stride;
+
+    hz_out1 = hz_out5;
+    vec0 = vec2;
+  }
+}
+
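+/* 6-tap horizontal plus 4-tap vertical filter, 8-wide. */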
+static inline void common_hv_6ht_4vt_8w_lsx(
+    uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
+    int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
+    int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+
+  __m128i src0, src1, src2, src3, src4, src5, src6;
+  __m128i filt_hz0, filt_hz1, filt_hz2, mask0, mask1, mask2;
+  __m128i filt, filt_vt0, filt_vt1, hz_out0, hz_out1, hz_out2, hz_out3;
+  __m128i tmp0, tmp1, tmp2, tmp3, vec0, vec1, vec2, vec3;
+  __m128i out0, out1;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= (2 + src_stride);
+
+  filt = __lsx_vld(filter_horiz, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1);
+  filt_hz2 = __lsx_vreplvei_h(filt, 2);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+
+  DUP2_ARG2(__lsx_vldx, src, 0, src, src_stride, src0, src1);
+  src2 = __lsx_vldx(src, src_stride_x2);
+  src += src_stride_x3;
+
+  DUP2_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src0, src1);
+  src2 = __lsx_vxori_b(src2, 128);
+  hz_out0 = horiz_6tap_filt(src0, src0, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out1 = horiz_6tap_filt(src1, src1, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  hz_out2 = horiz_6tap_filt(src2, src2, mask0, mask1, mask2, filt_hz0, filt_hz1,
+                            filt_hz2);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out2, hz_out1, vec0, vec2);
+
+  filt = __lsx_vld(filter_vert, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src3, src4, src5, src6);
+    src += src_stride_x4;
+    DUP4_ARG2(__lsx_vxori_b, src3, 128, src4, 128, src5, 128, src6, 128, src3,
+              src4, src5, src6);
+
+    hz_out3 = horiz_6tap_filt(src3, src3, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    vec1 = __lsx_vpackev_b(hz_out3, hz_out2);
+    tmp0 = filt_4tap_dpadd_h(vec0, vec1, filt_vt0, filt_vt1);
+
+    hz_out0 = horiz_6tap_filt(src4, src4, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    vec3 = __lsx_vpackev_b(hz_out0, hz_out3);
+    tmp1 = filt_4tap_dpadd_h(vec2, vec3, filt_vt0, filt_vt1);
+
+    hz_out1 = horiz_6tap_filt(src5, src5, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp2 = filt_4tap_dpadd_h(vec1, vec0, filt_vt0, filt_vt1);
+
+    hz_out2 = horiz_6tap_filt(src6, src6, mask0, mask1, mask2, filt_hz0,
+                              filt_hz1, filt_hz2);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out3, hz_out2, hz_out1, vec1, vec2);
+    tmp3 = filt_4tap_dpadd_h(vec1, vec2, filt_vt0, filt_vt1);
+
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, VP8_FILTER_SHIFT, tmp3, tmp2,
+              VP8_FILTER_SHIFT, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+  }
+}
+
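+/* 16-wide 6-tap/4-tap two-pass filter: run the 8-wide kernel on each half. */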
+static void common_hv_6ht_4vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height) {
+  common_hv_6ht_4vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                           filter_vert, height);
+  common_hv_6ht_4vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride,
+                           filter_horiz, filter_vert, height);
+}
+
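+/* 4-tap horizontal plus 6-tap vertical filter, 4-wide. */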
+static void common_hv_4ht_6vt_4w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                     uint8_t *RESTRICT dst, int32_t dst_stride,
+                                     const int8_t *filter_horiz,
+                                     const int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i filt_hz0, filt_hz1, filt_vt0, filt_vt1, filt_vt2, mask0, mask1;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  __m128i hz_out7, tmp0, tmp1, out0, out1, out2, out3;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 16);
+
+  src -= 1;
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filt_hz0,
+            filt_hz1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  DUP4_ARG2(__lsx_vldx, src, -src_stride_x2, src, -src_stride, src, src_stride,
+            src, src_stride_x2, src0, src1, src3, src4);
+  src2 = __lsx_vld(src, 0);
+  src += src_stride_x3;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src4 = __lsx_vxori_b(src4, 128);
+  hz_out0 = horiz_4tap_filt(src0, src1, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out2 = horiz_4tap_filt(src2, src3, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out3 = horiz_4tap_filt(src3, src4, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
+
+  DUP2_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filt_vt0,
+            filt_vt1);
+  filt_vt2 = __lsx_vldrepl_h(filter_vert, 4);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    src5 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride_x2, src6, src7);
+    src8 = __lsx_vldx(src, src_stride_x3);
+    DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
+              src6, src7, src8);
+    src += src_stride_x4;
+
+    hz_out5 = horiz_4tap_filt(src5, src6, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out4 = __lsx_vshuf_b(hz_out5, hz_out3, shuff);
+    out2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out7 = horiz_4tap_filt(src7, src8, mask0, mask1, filt_hz0, filt_hz1);
+    hz_out6 = __lsx_vshuf_b(hz_out7, hz_out5, shuff);
+    out3 = __lsx_vpackev_b(hz_out7, hz_out6);
+    tmp1 = dpadd_h3(out1, out2, out3, filt_vt0, filt_vt1, filt_vt2);
+
+    tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
+    tmp0 = __lsx_vxori_b(tmp0, 128);
+    __lsx_vstelm_w(tmp0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(tmp0, dst, 0, 3);
+    dst += dst_stride;
+
+    hz_out3 = hz_out7;
+    out0 = out2;
+    out1 = out3;
+  }
+}
+
+static inline void common_hv_4ht_6vt_8w_lsx(
+    uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
+    int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
+    int32_t height) {
+  uint32_t loop_cnt;
+  int32_t src_stride_x2 = src_stride << 1;
+  int32_t src_stride_x3 = src_stride_x2 + src_stride;
+  int32_t src_stride_x4 = src_stride << 2;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i filt_hz0, filt_hz1, mask0, mask1;
+  __m128i filt, filt_vt0, filt_vt1, filt_vt2, tmp0, tmp1, tmp2, tmp3;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  __m128i hz_out7, hz_out8, out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i vec0, vec1;
+
+  mask0 = __lsx_vld(vp8_mc_filt_mask_arr, 0);
+  src -= 1 + src_stride_x2;
+
+  filt = __lsx_vld(filter_horiz, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_hz0, filt_hz1);
+  mask1 = __lsx_vaddi_bu(mask0, 2);
+
+  DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+            src_stride_x3, src0, src1, src2, src3);
+  src += src_stride_x4;
+  src4 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  src4 = __lsx_vxori_b(src4, 128);
+  hz_out0 = horiz_4tap_filt(src0, src0, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out1 = horiz_4tap_filt(src1, src1, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out2 = horiz_4tap_filt(src2, src2, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out3 = horiz_4tap_filt(src3, src3, mask0, mask1, filt_hz0, filt_hz1);
+  hz_out4 = horiz_4tap_filt(src4, src4, mask0, mask1, filt_hz0, filt_hz1);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, out0, out1);
+  DUP2_ARG2(__lsx_vpackev_b, hz_out2, hz_out1, hz_out4, hz_out3, out3, out4);
+
+  filt = __lsx_vld(filter_vert, 0);
+  DUP2_ARG2(__lsx_vreplvei_h, filt, 0, filt, 1, filt_vt0, filt_vt1);
+  filt_vt2 = __lsx_vreplvei_h(filt, 2);
+
+  for (loop_cnt = (height >> 2); loop_cnt--;) {
+    DUP4_ARG2(__lsx_vldx, src, 0, src, src_stride, src, src_stride_x2, src,
+              src_stride_x3, src5, src6, src7, src8);
+    src += src_stride_x4;
+
+    DUP4_ARG2(__lsx_vxori_b, src5, 128, src6, 128, src7, 128, src8, 128, src5,
+              src6, src7, src8);
+    hz_out5 = horiz_4tap_filt(src5, src5, mask0, mask1, filt_hz0, filt_hz1);
+    out2 = __lsx_vpackev_b(hz_out5, hz_out4);
+    tmp0 = dpadd_h3(out0, out1, out2, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out6 = horiz_4tap_filt(src6, src6, mask0, mask1, filt_hz0, filt_hz1);
+    out5 = __lsx_vpackev_b(hz_out6, hz_out5);
+    tmp1 = dpadd_h3(out3, out4, out5, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out7 = horiz_4tap_filt(src7, src7, mask0, mask1, filt_hz0, filt_hz1);
+    out6 = __lsx_vpackev_b(hz_out7, hz_out6);
+    tmp2 = dpadd_h3(out1, out2, out6, filt_vt0, filt_vt1, filt_vt2);
+
+    hz_out8 = horiz_4tap_filt(src8, src8, mask0, mask1, filt_hz0, filt_hz1);
+    out7 = __lsx_vpackev_b(hz_out8, hz_out7);
+    tmp3 = dpadd_h3(out4, out5, out7, filt_vt0, filt_vt1, filt_vt2);
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, vec0, vec1);
+    DUP2_ARG2(__lsx_vxori_b, vec0, 128, vec1, 128, vec0, vec1);
+    __lsx_vstelm_d(vec0, dst, 0, 0);
+    __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(vec1, dst + dst_stride_x2, 0, 0);
+    __lsx_vstelm_d(vec1, dst + dst_stride_x3, 0, 1);
+    dst += dst_stride_x4;
+    hz_out4 = hz_out8;
+    out0 = out2;
+    out1 = out6;
+    out3 = out5;
+    out4 = out7;
+  }
+}
+
+static void common_hv_4ht_6vt_16w_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                      uint8_t *RESTRICT dst, int32_t dst_stride,
+                                      const int8_t *filter_horiz,
+                                      const int8_t *filter_vert,
+                                      int32_t height) {
+  common_hv_4ht_6vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                           filter_vert, height);
+  common_hv_4ht_6vt_8w_lsx(src + 8, src_stride, dst + 8, dst_stride,
+                           filter_horiz, filter_vert, height);
+}
+
+typedef void (*PVp8SixtapPredictFunc1)(
+    uint8_t *RESTRICT src, int32_t src_stride, uint8_t *RESTRICT dst,
+    int32_t dst_stride, const int8_t *filter_horiz, const int8_t *filter_vert,
+    int32_t height);
+
+typedef void (*PVp8SixtapPredictFunc2)(uint8_t *RESTRICT src,
+                                       int32_t src_stride,
+                                       uint8_t *RESTRICT dst,
+                                       int32_t dst_stride, const int8_t *filter,
+                                       int32_t height);
+
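+/* Dispatch convention (as read from the tables below, not normative):
+ * xoffset/yoffset are eighth-pel phases in [0, 7]. An even nonzero
+ * phase selects the full 6-tap filter row; an odd phase selects the
+ * short 4-tap filter, which starts one entry into the row (hence the
+ * `filter + 1` arguments). Funcs1[] holds the combined
+ * horizontal+vertical kernels indexed by the parity pair, Funcs2[]
+ * the one-dimensional kernels for the axis-aligned cases, and a zero
+ * offset on both axes degenerates to a copy. */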
+void vp8_sixtap_predict4x4_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                               int32_t xoffset, int32_t yoffset,
+                               uint8_t *RESTRICT dst, int32_t dst_stride) {
+  const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1];
+  const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1];
+
+  static PVp8SixtapPredictFunc1 Predict4x4Funcs1[4] = {
+    common_hv_6ht_6vt_4w_lsx,
+    common_hv_6ht_4vt_4w_lsx,
+    common_hv_4ht_6vt_4w_lsx,
+    common_hv_4ht_4vt_4w_lsx,
+  };
+
+  static PVp8SixtapPredictFunc2 Predict4x4Funcs2[4] = { common_vt_6t_4w_lsx,
+                                                        common_vt_4t_4w_lsx,
+                                                        common_hz_6t_4w_lsx,
+                                                        common_hz_4t_4w_lsx };
+  if (yoffset < 8 && xoffset < 8) {
+    if (yoffset) {
+      if (xoffset) {
+        switch (xoffset & 1) {
+          case 0:
+            switch (yoffset & 1) {
+              case 0:
+                Predict4x4Funcs1[0](src, src_stride, dst, dst_stride, h_filter,
+                                    v_filter, 4);
+                break;
+              case 1:
+                Predict4x4Funcs1[1](src, src_stride, dst, dst_stride, h_filter,
+                                    v_filter + 1, 4);
+                break;
+            }
+            break;
+
+          case 1:
+            switch (yoffset & 1) {
+              case 0:
+                Predict4x4Funcs1[2](src, src_stride, dst, dst_stride,
+                                    h_filter + 1, v_filter, 4);
+                break;
+
+              case 1:
+                Predict4x4Funcs1[3](src, src_stride, dst, dst_stride,
+                                    h_filter + 1, v_filter + 1, 4);
+                break;
+            }
+            break;
+        }
+      } else {
+        switch (yoffset & 1) {
+          case 0:
+            Predict4x4Funcs2[0](src, src_stride, dst, dst_stride, v_filter, 4);
+            break;
+
+          case 1:
+            Predict4x4Funcs2[1](src, src_stride, dst, dst_stride, v_filter + 1,
+                                4);
+            break;
+        }
+      }
+    } else {
+      switch (xoffset) {
+        case 0: {
+          /* Plain 4x4 copy: gather one 32-bit row into each of the
+           * four lanes, then scatter the lanes to dst below. */
+          __m128i tp0 = __lsx_vldi(0);
+          tp0 = __lsx_vinsgr2vr_w(tp0, *(int32_t *)src, 0);
+          src += src_stride;
+          tp0 = __lsx_vinsgr2vr_w(tp0, *(int32_t *)src, 1);
+          src += src_stride;
+          tp0 = __lsx_vinsgr2vr_w(tp0, *(int32_t *)src, 2);
+          src += src_stride;
+          tp0 = __lsx_vinsgr2vr_w(tp0, *(int32_t *)src, 3);
+
+          __lsx_vstelm_w(tp0, dst, 0, 0);
+          dst += dst_stride;
+          __lsx_vstelm_w(tp0, dst, 0, 1);
+          dst += dst_stride;
+          __lsx_vstelm_w(tp0, dst, 0, 2);
+          dst += dst_stride;
+          __lsx_vstelm_w(tp0, dst, 0, 3);
+          break;
+        }
+        case 2:
+        case 4:
+        case 6:
+          Predict4x4Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 4);
+          break;
+      }
+      switch (xoffset & 1) {
+        case 1:
+          Predict4x4Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1,
+                              4);
+          break;
+      }
+    }
+  }
+}
+
+void vp8_sixtap_predict8x8_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                               int32_t xoffset, int32_t yoffset,
+                               uint8_t *RESTRICT dst, int32_t dst_stride) {
+  const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1];
+  const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1];
+
+  static PVp8SixtapPredictFunc1 Predict8x8Funcs1[4] = {
+    common_hv_6ht_6vt_8w_lsx,
+    common_hv_6ht_4vt_8w_lsx,
+    common_hv_4ht_6vt_8w_lsx,
+    common_hv_4ht_4vt_8w_lsx,
+  };
+
+  static PVp8SixtapPredictFunc2 Predict8x8Funcs2[4] = { common_vt_6t_8w_lsx,
+                                                        common_vt_4t_8w_lsx,
+                                                        common_hz_6t_8w_lsx,
+                                                        common_hz_4t_8w_lsx };
+
+  if (yoffset < 8 && xoffset < 8) {
+    if (yoffset) {
+      if (xoffset) {
+        switch (xoffset & 1) {
+          case 0:
+            switch (yoffset & 1) {
+              case 0:
+                Predict8x8Funcs1[0](src, src_stride, dst, dst_stride, h_filter,
+                                    v_filter, 8);
+                break;
+
+              case 1:
+                Predict8x8Funcs1[1](src, src_stride, dst, dst_stride, h_filter,
+                                    v_filter + 1, 8);
+                break;
+            }
+            break;
+
+          case 1:
+            switch (yoffset & 1) {
+              case 0:
+                Predict8x8Funcs1[2](src, src_stride, dst, dst_stride,
+                                    h_filter + 1, v_filter, 8);
+                break;
+
+              case 1:
+                Predict8x8Funcs1[3](src, src_stride, dst, dst_stride,
+                                    h_filter + 1, v_filter + 1, 8);
+                break;
+            }
+            break;
+        }
+      } else {
+        switch (yoffset & 1) {
+          case 0:
+            Predict8x8Funcs2[0](src, src_stride, dst, dst_stride, v_filter, 8);
+            break;
+
+          case 1:
+            Predict8x8Funcs2[1](src, src_stride, dst, dst_stride, v_filter + 1,
+                                8);
+            break;
+        }
+      }
+    } else {
+      switch (xoffset & 1) {
+        case 1:
+          Predict8x8Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1,
+                              8);
+          break;
+      }
+      switch (xoffset) {
+        case 0: vp8_copy_mem8x8(src, src_stride, dst, dst_stride); break;
+        case 2:
+        case 4:
+        case 6:
+          Predict8x8Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 8);
+          break;
+      }
+    }
+  }
+}
+
+void vp8_sixtap_predict16x16_lsx(uint8_t *RESTRICT src, int32_t src_stride,
+                                 int32_t xoffset, int32_t yoffset,
+                                 uint8_t *RESTRICT dst, int32_t dst_stride) {
+  const int8_t *h_filter = vp8_subpel_filters_lsx[xoffset - 1];
+  const int8_t *v_filter = vp8_subpel_filters_lsx[yoffset - 1];
+
+  static PVp8SixtapPredictFunc1 Predict16x16Funcs1[4] = {
+    common_hv_6ht_6vt_16w_lsx,
+    common_hv_6ht_4vt_16w_lsx,
+    common_hv_4ht_6vt_16w_lsx,
+    common_hv_4ht_4vt_16w_lsx,
+  };
+
+  static PVp8SixtapPredictFunc2 Predict16x16Funcs2[4] = {
+    common_vt_6t_16w_lsx, common_vt_4t_16w_lsx, common_hz_6t_16w_lsx,
+    common_hz_4t_16w_lsx
+  };
+
+  if (yoffset < 8 && xoffset < 8) {
+    if (yoffset) {
+      if (xoffset) {
+        switch (xoffset & 1) {
+          case 0:
+            switch (yoffset & 1) {
+              case 0:
+                Predict16x16Funcs1[0](src, src_stride, dst, dst_stride,
+                                      h_filter, v_filter, 16);
+                break;
+
+              case 1:
+                Predict16x16Funcs1[1](src, src_stride, dst, dst_stride,
+                                      h_filter, v_filter + 1, 16);
+                break;
+            }
+            break;
+
+          case 1:
+            switch (yoffset & 1) {
+              case 0:
+                Predict16x16Funcs1[2](src, src_stride, dst, dst_stride,
+                                      h_filter + 1, v_filter, 16);
+                break;
+
+              case 1:
+                Predict16x16Funcs1[3](src, src_stride, dst, dst_stride,
+                                      h_filter + 1, v_filter + 1, 16);
+                break;
+            }
+            break;
+        }
+      } else {
+        switch (yoffset & 1) {
+          case 0:
+            Predict16x16Funcs2[0](src, src_stride, dst, dst_stride, v_filter,
+                                  16);
+            break;
+
+          case 1:
+            Predict16x16Funcs2[1](src, src_stride, dst, dst_stride,
+                                  v_filter + 1, 16);
+            break;
+        }
+      }
+    } else {
+      switch (xoffset & 1) {
+        case 1:
+          Predict16x16Funcs2[3](src, src_stride, dst, dst_stride, h_filter + 1,
+                                16);
+          break;
+      }
+      switch (xoffset) {
+        case 0: vp8_copy_mem16x16(src, src_stride, dst, dst_stride); break;
+        case 2:
+        case 4:
+        case 6:
+          Predict16x16Funcs2[2](src, src_stride, dst, dst_stride, h_filter, 16);
+          break;
+      }
+    }
+  }
+}
diff --git a/libvpx/vp8/common/rtcd_defs.pl b/libvpx/vp8/common/rtcd_defs.pl
index 8452b5e..739a612 100644
--- a/libvpx/vp8/common/rtcd_defs.pl
+++ b/libvpx/vp8/common/rtcd_defs.pl
@@ -38,25 +38,25 @@
 specialize qw/vp8_dequant_idct_add mmx neon dspr2 msa mmi/;
 
 add_proto qw/void vp8_dequant_idct_add_y_block/, "short *q, short *dq, unsigned char *dst, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa mmi/;
+specialize qw/vp8_dequant_idct_add_y_block sse2 neon dspr2 msa mmi lsx/;
 
 add_proto qw/void vp8_dequant_idct_add_uv_block/, "short *q, short *dq, unsigned char *dst_u, unsigned char *dst_v, int stride, char *eobs";
-specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa mmi/;
+specialize qw/vp8_dequant_idct_add_uv_block sse2 neon dspr2 msa mmi lsx/;
 
 #
 # Loopfilter
 #
 add_proto qw/void vp8_loop_filter_mbv/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa mmi/;
+specialize qw/vp8_loop_filter_mbv sse2 neon dspr2 msa mmi lsx/;
 
 add_proto qw/void vp8_loop_filter_bv/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa mmi/;
+specialize qw/vp8_loop_filter_bv sse2 neon dspr2 msa mmi lsx/;
 
 add_proto qw/void vp8_loop_filter_mbh/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa mmi/;
+specialize qw/vp8_loop_filter_mbh sse2 neon dspr2 msa mmi lsx/;
 
 add_proto qw/void vp8_loop_filter_bh/, "unsigned char *y_ptr, unsigned char *u_ptr, unsigned char *v_ptr, int y_stride, int uv_stride, struct loop_filter_info *lfi";
-specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa mmi/;
+specialize qw/vp8_loop_filter_bh sse2 neon dspr2 msa mmi lsx/;
 
 
 add_proto qw/void vp8_loop_filter_simple_mbv/, "unsigned char *y_ptr, int y_stride, const unsigned char *blimit";
@@ -108,7 +108,7 @@
 
 #idct1_scalar_add
 add_proto qw/void vp8_dc_only_idct_add/, "short input_dc, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride";
-specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa mmi/;
+specialize qw/vp8_dc_only_idct_add mmx neon dspr2 msa mmi lsx/;
 
 #
 # RECON
@@ -146,16 +146,16 @@
 # Subpixel
 #
 add_proto qw/void vp8_sixtap_predict16x16/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
-specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa mmi/;
+specialize qw/vp8_sixtap_predict16x16 sse2 ssse3 neon dspr2 msa mmi lsx/;
 
 add_proto qw/void vp8_sixtap_predict8x8/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
-specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa mmi/;
+specialize qw/vp8_sixtap_predict8x8 sse2 ssse3 neon dspr2 msa mmi lsx/;
 
 add_proto qw/void vp8_sixtap_predict8x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
 specialize qw/vp8_sixtap_predict8x4 sse2 ssse3 neon dspr2 msa mmi/;
 
 add_proto qw/void vp8_sixtap_predict4x4/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
-specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa mmi/;
+specialize qw/vp8_sixtap_predict4x4 mmx ssse3 neon dspr2 msa mmi lsx/;
 
 add_proto qw/void vp8_bilinear_predict16x16/, "unsigned char *src_ptr, int src_pixels_per_line, int xoffset, int yoffset, unsigned char *dst_ptr, int dst_pitch";
 specialize qw/vp8_bilinear_predict16x16 sse2 ssse3 neon msa/;
@@ -184,10 +184,10 @@
 # Forward DCT
 #
 add_proto qw/void vp8_short_fdct4x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct4x4 sse2 neon msa mmi/;
+specialize qw/vp8_short_fdct4x4 sse2 neon msa mmi lsx/;
 
 add_proto qw/void vp8_short_fdct8x4/, "short *input, short *output, int pitch";
-specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi/;
+specialize qw/vp8_short_fdct8x4 sse2 neon msa mmi lsx/;
 
 add_proto qw/void vp8_short_walsh4x4/, "short *input, short *output, int pitch";
 specialize qw/vp8_short_walsh4x4 sse2 neon msa mmi/;
@@ -196,7 +196,7 @@
 # Quantizer
 #
 add_proto qw/void vp8_regular_quantize_b/, "struct block *, struct blockd *";
-specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa mmi/;
+specialize qw/vp8_regular_quantize_b sse2 sse4_1 msa mmi lsx/;
 
 add_proto qw/void vp8_fast_quantize_b/, "struct block *, struct blockd *";
 specialize qw/vp8_fast_quantize_b sse2 ssse3 neon msa mmi/;
@@ -205,10 +205,10 @@
 # Block subtraction
 #
 add_proto qw/int vp8_block_error/, "short *coeff, short *dqcoeff";
-specialize qw/vp8_block_error sse2 msa/;
+specialize qw/vp8_block_error sse2 msa lsx/;
 
 add_proto qw/int vp8_mbblock_error/, "struct macroblock *mb, int dc";
-specialize qw/vp8_mbblock_error sse2 msa/;
+specialize qw/vp8_mbblock_error sse2 msa lsx/;
 
 add_proto qw/int vp8_mbuverror/, "struct macroblock *mb";
 specialize qw/vp8_mbuverror sse2 msa/;
@@ -216,20 +216,16 @@
 #
 # Motion search
 #
-add_proto qw/int vp8_full_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int sad_per_bit, int distance, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
-specialize qw/vp8_full_search_sad sse3 sse4_1/;
-$vp8_full_search_sad_sse3=vp8_full_search_sadx3;
-$vp8_full_search_sad_sse4_1=vp8_full_search_sadx8;
-
 add_proto qw/int vp8_refining_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, int error_per_bit, int search_range, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
 specialize qw/vp8_refining_search_sad sse2 msa/;
 $vp8_refining_search_sad_sse2=vp8_refining_search_sadx4;
 $vp8_refining_search_sad_msa=vp8_refining_search_sadx4;
 
 add_proto qw/int vp8_diamond_search_sad/, "struct macroblock *x, struct block *b, struct blockd *d, union int_mv *ref_mv, union int_mv *best_mv, int search_param, int sad_per_bit, int *num00, struct variance_vtable *fn_ptr, int *mvcost[2], union int_mv *center_mv";
-specialize qw/vp8_diamond_search_sad sse2 msa/;
+specialize qw/vp8_diamond_search_sad sse2 msa lsx/;
 $vp8_diamond_search_sad_sse2=vp8_diamond_search_sadx4;
 $vp8_diamond_search_sad_msa=vp8_diamond_search_sadx4;
+$vp8_diamond_search_sad_lsx=vp8_diamond_search_sadx4;
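+# LSX has no dedicated diamond-search kernel: the lsx specialization
+# reuses the generic vp8_diamond_search_sadx4, whose definition is
+# gated on HAVE_SSE2 || HAVE_MSA || HAVE_LSX in mcomp.c.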
 
 #
 # Alt-ref Noise Reduction (ARNR)
diff --git a/libvpx/vp8/common/x86/bilinear_filter_sse2.c b/libvpx/vp8/common/x86/bilinear_filter_sse2.c
index 9bf65d8..ff6cbbd 100644
--- a/libvpx/vp8/common/x86/bilinear_filter_sse2.c
+++ b/libvpx/vp8/common/x86/bilinear_filter_sse2.c
@@ -313,10 +313,10 @@
       const __m128i compensated = _mm_add_epi16(sum, round_factor);
       const __m128i shifted = _mm_srai_epi16(compensated, VP8_FILTER_SHIFT);
       __m128i packed = _mm_packus_epi16(shifted, shifted);
-      storeu_uint32(dst, _mm_cvtsi128_si32(packed));
+      storeu_int32(dst, _mm_cvtsi128_si32(packed));
       packed = _mm_srli_si128(packed, 4);
       dst += stride;
-      storeu_uint32(dst, _mm_cvtsi128_si32(packed));
+      storeu_int32(dst, _mm_cvtsi128_si32(packed));
       dst += stride;
       src += 8;
     }
diff --git a/libvpx/vp8/decoder/decodeframe.c b/libvpx/vp8/decoder/decodeframe.c
index 67c254f..1c15667 100644
--- a/libvpx/vp8/decoder/decodeframe.c
+++ b/libvpx/vp8/decoder/decodeframe.c
@@ -872,8 +872,8 @@
   xd->mode_info_stride = pc->mode_info_stride;
   xd->corrupted = 0; /* init without corruption */
 
-  xd->fullpixel_mask = 0xffffffff;
-  if (pc->full_pixel) xd->fullpixel_mask = 0xfffffff8;
+  xd->fullpixel_mask = ~0;
+  if (pc->full_pixel) xd->fullpixel_mask = ~7;
 }
 
 int vp8_decode_frame(VP8D_COMP *pbi) {
diff --git a/libvpx/vp8/decoder/decodemv.c b/libvpx/vp8/decoder/decodemv.c
index 9437385..51817a2 100644
--- a/libvpx/vp8/decoder/decodemv.c
+++ b/libvpx/vp8/decoder/decodemv.c
@@ -173,7 +173,8 @@
   { 208, 1, 1 }     /* SUBMVREF_LEFT_ABOVE_ZED  */
 };
 
-static const vp8_prob *get_sub_mv_ref_prob(const int left, const int above) {
+static const vp8_prob *get_sub_mv_ref_prob(const uint32_t left,
+                                           const uint32_t above) {
   int lez = (left == 0);
   int aez = (above == 0);
   int lea = (left == above);
diff --git a/libvpx/vp8/decoder/threading.c b/libvpx/vp8/decoder/threading.c
index 491e2ce..490f62d 100644
--- a/libvpx/vp8/decoder/threading.c
+++ b/libvpx/vp8/decoder/threading.c
@@ -74,9 +74,9 @@
     memcpy(mbd->dequant_y2, xd->dequant_y2, sizeof(xd->dequant_y2));
     memcpy(mbd->dequant_uv, xd->dequant_uv, sizeof(xd->dequant_uv));
 
-    mbd->fullpixel_mask = 0xffffffff;
+    mbd->fullpixel_mask = ~0;
 
-    if (pc->full_pixel) mbd->fullpixel_mask = 0xfffffff8;
+    if (pc->full_pixel) mbd->fullpixel_mask = ~7;
   }
 
   for (i = 0; i < pc->mb_rows; ++i)
diff --git a/libvpx/vp8/encoder/bitstream.c b/libvpx/vp8/encoder/bitstream.c
index 87825fa..0e97af5 100644
--- a/libvpx/vp8/encoder/bitstream.c
+++ b/libvpx/vp8/encoder/bitstream.c
@@ -172,9 +172,8 @@
         validate_buffer(w->buffer + w->pos, 1, w->buffer_end, w->error);
 
         w->buffer[w->pos++] = (lowvalue >> (24 - offset)) & 0xff;
-        lowvalue <<= offset;
         shift = count;
-        lowvalue &= 0xffffff;
+        lowvalue = (int)(((uint64_t)lowvalue << offset) & 0xffffff);
         count -= 8;
       }
 
@@ -223,9 +222,8 @@
             validate_buffer(w->buffer + w->pos, 1, w->buffer_end, w->error);
 
             w->buffer[w->pos++] = (lowvalue >> (24 - offset)) & 0xff;
-            lowvalue <<= offset;
             shift = count;
-            lowvalue &= 0xffffff;
+            lowvalue = (int)(((uint64_t)lowvalue << offset) & 0xffffff);
             count -= 8;
           }
 
diff --git a/libvpx/vp8/encoder/boolhuff.h b/libvpx/vp8/encoder/boolhuff.h
index 8cc61bd..a8c536b 100644
--- a/libvpx/vp8/encoder/boolhuff.h
+++ b/libvpx/vp8/encoder/boolhuff.h
@@ -94,9 +94,8 @@
     validate_buffer(bc->buffer + bc->pos, 1, bc->buffer_end, bc->error);
     bc->buffer[bc->pos++] = (lowvalue >> (24 - offset) & 0xff);
 
-    lowvalue <<= offset;
     shift = count;
-    lowvalue &= 0xffffff;
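+    /* Widening to 64 bits keeps the left shift well defined when
+     * `offset` would move set bits into or past bit 31 of the 32-bit
+     * accumulator; the mask then truncates back to 24 bits as before. */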
+    lowvalue = (int)(((uint64_t)lowvalue << offset) & 0xffffff);
     count -= 8;
   }
 
diff --git a/libvpx/vp8/encoder/encodeframe.c b/libvpx/vp8/encoder/encodeframe.c
index 2f84381..6201075 100644
--- a/libvpx/vp8/encoder/encodeframe.c
+++ b/libvpx/vp8/encoder/encodeframe.c
@@ -69,10 +69,9 @@
                                                 128, 128, 128, 128 };
 
 /* Original activity measure from Tim T's code. */
-static unsigned int tt_activity_measure(VP8_COMP *cpi, MACROBLOCK *x) {
+static unsigned int tt_activity_measure(MACROBLOCK *x) {
   unsigned int act;
   unsigned int sse;
-  (void)cpi;
   /* TODO: This could also be done over smaller areas (8x8), but that would
    *  require extensive changes elsewhere, as lambda is assumed to be fixed
    *  over an entire MB in most of the code.
@@ -90,28 +89,21 @@
   return act;
 }
 
-/* Stub for alternative experimental activity measures. */
-static unsigned int alt_activity_measure(VP8_COMP *cpi, MACROBLOCK *x,
-                                         int use_dc_pred) {
-  return vp8_encode_intra(cpi, x, use_dc_pred);
-}
-
 /* Measure the activity of the current macroblock
  * What we measure here is TBD so abstracted to this function
  */
 #define ALT_ACT_MEASURE 1
-static unsigned int mb_activity_measure(VP8_COMP *cpi, MACROBLOCK *x,
-                                        int mb_row, int mb_col) {
+static unsigned int mb_activity_measure(MACROBLOCK *x, int mb_row, int mb_col) {
   unsigned int mb_activity;
 
   if (ALT_ACT_MEASURE) {
     int use_dc_pred = (mb_col || mb_row) && (!mb_col || !mb_row);
 
-    /* Or use and alternative. */
-    mb_activity = alt_activity_measure(cpi, x, use_dc_pred);
+    /* Or use an alternative. */
+    mb_activity = vp8_encode_intra(x, use_dc_pred);
   } else {
     /* Original activity measure from Tim T's code. */
-    mb_activity = tt_activity_measure(cpi, x);
+    mb_activity = tt_activity_measure(x);
   }
 
   if (mb_activity < VP8_ACTIVITY_AVG_MIN) mb_activity = VP8_ACTIVITY_AVG_MIN;
@@ -264,7 +256,7 @@
       vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
 
       /* measure activity */
-      mb_activity = mb_activity_measure(cpi, x, mb_row, mb_col);
+      mb_activity = mb_activity_measure(x, mb_row, mb_col);
 
       /* Keep frame sum */
       activity_sum += mb_activity;
@@ -634,12 +626,13 @@
                              cpi->prob_last_coded, cpi->prob_gf_coded);
   }
 
-  xd->fullpixel_mask = 0xffffffff;
-  if (cm->full_pixel) xd->fullpixel_mask = 0xfffffff8;
+  xd->fullpixel_mask = ~0;
+  if (cm->full_pixel) xd->fullpixel_mask = ~7;
 
   vp8_zero(x->coef_counts);
   vp8_zero(x->ymode_count);
-  vp8_zero(x->uv_mode_count) x->prediction_error = 0;
+  vp8_zero(x->uv_mode_count);
+  x->prediction_error = 0;
   x->intra_error = 0;
   vp8_zero(x->count_mb_ref_frame_usage);
 }
@@ -766,12 +759,12 @@
 
       for (mb_row = 0; mb_row < cm->mb_rows;
            mb_row += (cpi->encoding_thread_count + 1)) {
-        vp8_zero(cm->left_context)
+        vp8_zero(cm->left_context);
 
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
-            tp = cpi->tok;
+        tp = cpi->tok;
 #else
-            tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
+        tp = cpi->tok + mb_row * (cm->mb_cols * 16 * 24);
 #endif
 
         encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
@@ -858,10 +851,10 @@
 
       /* for each macroblock row in image */
       for (mb_row = 0; mb_row < cm->mb_rows; ++mb_row) {
-        vp8_zero(cm->left_context)
+        vp8_zero(cm->left_context);
 
 #if CONFIG_REALTIME_ONLY & CONFIG_ONTHEFLY_BITPACKING
-            tp = cpi->tok;
+        tp = cpi->tok;
 #endif
 
         encode_mb_row(cpi, cm, mb_row, x, xd, &tp, segment_counts, &totalrate);
diff --git a/libvpx/vp8/encoder/encodeintra.c b/libvpx/vp8/encoder/encodeintra.c
index f89e7cb..7d448c0 100644
--- a/libvpx/vp8/encoder/encodeintra.c
+++ b/libvpx/vp8/encoder/encodeintra.c
@@ -18,10 +18,9 @@
 #include "vp8/common/invtrans.h"
 #include "encodeintra.h"
 
-int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred) {
+int vp8_encode_intra(MACROBLOCK *x, int use_dc_pred) {
   int i;
   int intra_pred_var = 0;
-  (void)cpi;
 
   if (use_dc_pred) {
     x->e_mbd.mode_info_context->mbmi.mode = DC_PRED;
diff --git a/libvpx/vp8/encoder/encodeintra.h b/libvpx/vp8/encoder/encodeintra.h
index 021dc5e..9a378ab 100644
--- a/libvpx/vp8/encoder/encodeintra.h
+++ b/libvpx/vp8/encoder/encodeintra.h
@@ -16,7 +16,7 @@
 extern "C" {
 #endif
 
-int vp8_encode_intra(VP8_COMP *cpi, MACROBLOCK *x, int use_dc_pred);
+int vp8_encode_intra(MACROBLOCK *x, int use_dc_pred);
 void vp8_encode_intra16x16mby(MACROBLOCK *x);
 void vp8_encode_intra16x16mbuv(MACROBLOCK *x);
 void vp8_encode_intra4x4mby(MACROBLOCK *mb);
diff --git a/libvpx/vp8/encoder/encodemv.c b/libvpx/vp8/encoder/encodemv.c
index 04adf10..c88ea16 100644
--- a/libvpx/vp8/encoder/encodemv.c
+++ b/libvpx/vp8/encoder/encodemv.c
@@ -160,7 +160,7 @@
   const unsigned int tot = ct[0] + ct[1];
 
   if (tot) {
-    const vp8_prob x = ((ct[0] * 255) / tot) & -2;
+    const vp8_prob x = ((ct[0] * 255) / tot) & ~1u;
     *p = x ? x : 1;
   }
 }
@@ -205,8 +205,11 @@
   (void)rc;
   vp8_copy_array(Pnew, default_mvc, MVPcount);
 
-  vp8_zero(is_short_ct) vp8_zero(sign_ct) vp8_zero(bit_ct) vp8_zero(short_ct)
-      vp8_zero(short_bct)
+  vp8_zero(is_short_ct);
+  vp8_zero(sign_ct);
+  vp8_zero(bit_ct);
+  vp8_zero(short_ct);
+  vp8_zero(short_bct);
 
   /* j=0 */
   {
diff --git a/libvpx/vp8/encoder/ethreading.c b/libvpx/vp8/encoder/ethreading.c
index 55a1528..cb35f4f 100644
--- a/libvpx/vp8/encoder/ethreading.c
+++ b/libvpx/vp8/encoder/ethreading.c
@@ -470,8 +470,8 @@
 
     setup_mbby_copy(&mbr_ei[i].mb, x);
 
-    mbd->fullpixel_mask = 0xffffffff;
-    if (cm->full_pixel) mbd->fullpixel_mask = 0xfffffff8;
+    mbd->fullpixel_mask = ~0;
+    if (cm->full_pixel) mbd->fullpixel_mask = ~7;
 
     vp8_zero(mb->coef_counts);
     vp8_zero(x->ymode_count);
diff --git a/libvpx/vp8/encoder/firstpass.c b/libvpx/vp8/encoder/firstpass.c
index 981c0fd..ed177e3 100644
--- a/libvpx/vp8/encoder/firstpass.c
+++ b/libvpx/vp8/encoder/firstpass.c
@@ -567,7 +567,7 @@
       vp8_copy_mem16x16(x->src.y_buffer, x->src.y_stride, x->thismb, 16);
 
       /* do intra 16x16 prediction */
-      this_error = vp8_encode_intra(cpi, x, use_dc_pred);
+      this_error = vp8_encode_intra(x, use_dc_pred);
 
       /* "intrapenalty" below deals with situations where the intra
        * and inter error scores are very low (eg a plain black frame)
@@ -1631,7 +1631,6 @@
   double this_frame_mv_in_out = 0.0;
   double mv_in_out_accumulator = 0.0;
   double abs_mv_in_out_accumulator = 0.0;
-  double mod_err_per_mb_accumulator = 0.0;
 
   int max_bits = frame_max_bits(cpi); /* Max for a single frame */
 
@@ -1682,9 +1681,6 @@
 
     gf_group_err += mod_frame_err;
 
-    mod_err_per_mb_accumulator +=
-        mod_frame_err / DOUBLE_DIVIDE_CHECK((double)cpi->common.MBs);
-
     if (EOF == input_stats(cpi, &next_frame)) break;
 
     /* Test for the case where there is a brief flash but the prediction
diff --git a/libvpx/vp8/encoder/lookahead.c b/libvpx/vp8/encoder/lookahead.c
index 37aa9ee..49f851d 100644
--- a/libvpx/vp8/encoder/lookahead.c
+++ b/libvpx/vp8/encoder/lookahead.c
@@ -66,8 +66,8 @@
   depth += 1;
 
   /* Align the buffer dimensions */
-  width = (width + 15) & ~15;
-  height = (height + 15) & ~15;
+  width = (width + 15) & ~15u;
+  height = (height + 15) & ~15u;
 
   /* Allocate the lookahead structures */
   ctx = calloc(1, sizeof(*ctx));
diff --git a/libvpx/vp8/encoder/loongarch/dct_lsx.c b/libvpx/vp8/encoder/loongarch/dct_lsx.c
new file mode 100644
index 0000000..a08d4d3
--- /dev/null
+++ b/libvpx/vp8/encoder/loongarch/dct_lsx.c
@@ -0,0 +1,161 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdint.h>
+#include "./vp8_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3;                            \
+                                                                               \
+    DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1);                \
+    DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3);                \
+    _t0 = __lsx_vilvl_h(_s1, _s0);                                             \
+    _t1 = __lsx_vilvh_h(_s1, _s0);                                             \
+    _t2 = __lsx_vilvl_h(_s3, _s2);                                             \
+    _t3 = __lsx_vilvh_h(_s3, _s2);                                             \
+    DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2);              \
+    DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3);              \
+  }
+
+#define SET_DOTP_VALUES(coeff, val0, val1, val2, const1, const2)           \
+  {                                                                        \
+    __m128i tmp0_m, tmp1_m, tmp2_m;                                        \
+                                                                           \
+    tmp0_m = __lsx_vreplvei_h(coeff, val0);                                \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff, val1, coeff, val2, tmp1_m, tmp2_m); \
+    DUP2_ARG2(__lsx_vpackev_h, tmp1_m, tmp0_m, tmp0_m, tmp2_m, const1,     \
+              const2);                                                     \
+  }
+
+#define RET_1_IF_NZERO_H(_in)           \
+  ({                                    \
+    __m128i tmp_m;                      \
+    __m128i one_m = __lsx_vldi(0x401);  \
+    __m128i max_m = __lsx_vldi(0xFF);   \
+                                        \
+    tmp_m = __lsx_vseqi_h(_in, 0);      \
+    tmp_m = __lsx_vxor_v(tmp_m, max_m); \
+    tmp_m = __lsx_vand_v(tmp_m, one_m); \
+                                        \
+    tmp_m;                              \
+  })
+
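+/* RET_1_IF_NZERO_H returns a vector holding 1 in each halfword lane
+ * whose input lane is nonzero and 0 elsewhere; the fdcts below use it
+ * to mirror the scalar code's `(d1 != 0)` bias on the second-stage
+ * coefficient. The packed `coeff` constant appears to hold the usual
+ * VP8 fdct multipliers (2217, 5352) and rounding terms as halfwords. */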
+void vp8_short_fdct4x4_lsx(int16_t *input, int16_t *output, int32_t pitch) {
+  __m128i in0, in1, in2, in3;
+  __m128i tmp0, tmp1, tmp2, tmp3, const0, const1;
+  __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c };
+  __m128i out0, out1, out2, out3;
+  __m128i zero = __lsx_vldi(0);
+  int32_t pitch2 = pitch << 1;
+  int32_t pitch3 = pitch2 + pitch;
+
+  in0 = __lsx_vld(input, 0);
+  DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2);
+  in3 = __lsx_vldx(input, pitch3);
+
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+  LSX_BUTTERFLY_4_H(in0, in1, in2, in3, tmp0, tmp1, in1, in3);
+  DUP4_ARG2(__lsx_vslli_h, tmp0, 3, tmp1, 3, in1, 3, in3, 3, tmp0, tmp1, in1,
+            in3);
+  in0 = __lsx_vadd_h(tmp0, tmp1);
+  in2 = __lsx_vsub_h(tmp0, tmp1);
+  SET_DOTP_VALUES(coeff, 0, 1, 2, const0, const1);
+  tmp0 = __lsx_vilvl_h(in3, in1);
+  in1 = __lsx_vreplvei_h(coeff, 3);
+  out0 = __lsx_vpackev_h(zero, in1);
+  coeff = __lsx_vilvl_h(zero, coeff);
+  out1 = __lsx_vreplvei_w(coeff, 0);
+  DUP2_ARG3(__lsx_vdp2add_w_h, out0, tmp0, const0, out1, tmp0, const1, out0,
+            out1);
+  DUP2_ARG3(__lsx_vsrani_h_w, out0, out0, 12, out1, out1, 12, in1, in3);
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+  LSX_BUTTERFLY_4_H(in0, in1, in2, in3, tmp0, tmp1, in1, in3);
+  tmp2 = __lsx_vadd_h(tmp0, tmp1);
+  tmp3 = __lsx_vsub_h(tmp0, tmp1);
+  DUP2_ARG2(__lsx_vaddi_hu, tmp2, 7, tmp3, 7, in0, in2);
+  DUP2_ARG2(__lsx_vsrai_h, in0, 4, in2, 4, in0, in2);
+  DUP2_ARG2(__lsx_vilvl_h, zero, in0, zero, in2, out0, out2);
+  tmp1 = RET_1_IF_NZERO_H(in3);
+  DUP2_ARG2(__lsx_vilvl_h, zero, tmp1, in3, in1, tmp1, tmp0);
+  DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, out3, out1);
+  out3 = __lsx_vadd_w(out3, out1);
+  out1 = __lsx_vreplvei_w(coeff, 1);
+  DUP2_ARG3(__lsx_vdp2add_w_h, out1, tmp0, const0, out3, tmp0, const1, out1,
+            out3);
+  DUP2_ARG2(__lsx_vsrai_w, out1, 16, out3, 16, out1, out3);
+  out1 = __lsx_vadd_w(out1, tmp1);
+  DUP2_ARG2(__lsx_vpickev_h, out1, out0, out3, out2, in0, in2);
+  __lsx_vst(in0, output, 0);
+  __lsx_vst(in2, output, 16);
+}
+
+void vp8_short_fdct8x4_lsx(int16_t *input, int16_t *output, int32_t pitch) {
+  __m128i in0, in1, in2, in3, temp0, temp1, tmp0, tmp1;
+  __m128i const0, const1, const2, vec0_w, vec1_w, vec2_w, vec3_w;
+  __m128i coeff = { 0x38a4eb1814e808a9, 0x659061a82ee01d4c };
+  __m128i zero = __lsx_vldi(0);
+  int32_t pitch2 = pitch << 1;
+  int32_t pitch3 = pitch2 + pitch;
+
+  in0 = __lsx_vld(input, 0);
+  DUP2_ARG2(__lsx_vldx, input, pitch, input, pitch2, in1, in2);
+  in3 = __lsx_vldx(input, pitch3);
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+
+  LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3);
+  DUP4_ARG2(__lsx_vslli_h, temp0, 3, temp1, 3, in1, 3, in3, 3, temp0, temp1,
+            in1, in3);
+  in0 = __lsx_vadd_h(temp0, temp1);
+  in2 = __lsx_vsub_h(temp0, temp1);
+  SET_DOTP_VALUES(coeff, 0, 1, 2, const1, const2);
+  temp0 = __lsx_vreplvei_h(coeff, 3);
+  vec1_w = __lsx_vpackev_h(zero, temp0);
+  coeff = __lsx_vilvh_h(zero, coeff);
+  vec3_w = __lsx_vreplvei_w(coeff, 0);
+  tmp1 = __lsx_vilvl_h(in3, in1);
+  tmp0 = __lsx_vilvh_h(in3, in1);
+  vec0_w = vec1_w;
+  vec2_w = vec3_w;
+  DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1,
+            vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w,
+            vec3_w);
+  DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 12, vec3_w, vec2_w, 12, in1, in3);
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+
+  LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, temp1, in1, in3);
+  in0 = __lsx_vadd_h(temp0, temp1);
+  in0 = __lsx_vaddi_hu(in0, 7);
+  in2 = __lsx_vsub_h(temp0, temp1);
+  in2 = __lsx_vaddi_hu(in2, 7);
+  in0 = __lsx_vsrai_h(in0, 4);
+  in2 = __lsx_vsrai_h(in2, 4);
+  DUP2_ARG2(__lsx_vreplvei_w, coeff, 2, coeff, 3, vec3_w, vec1_w);
+  vec3_w = __lsx_vadd_w(vec3_w, vec1_w);
+  vec1_w = __lsx_vreplvei_w(coeff, 1);
+  const0 = RET_1_IF_NZERO_H(in3);
+  tmp1 = __lsx_vilvl_h(in3, in1);
+  tmp0 = __lsx_vilvh_h(in3, in1);
+  vec0_w = vec1_w;
+  vec2_w = vec3_w;
+  DUP4_ARG3(__lsx_vdp2add_w_h, vec0_w, tmp1, const1, vec1_w, tmp0, const1,
+            vec2_w, tmp1, const2, vec3_w, tmp0, const2, vec0_w, vec1_w, vec2_w,
+            vec3_w);
+  DUP2_ARG3(__lsx_vsrani_h_w, vec1_w, vec0_w, 16, vec3_w, vec2_w, 16, in1, in3);
+  in1 = __lsx_vadd_h(in1, const0);
+  DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, temp0, temp1);
+  __lsx_vst(temp0, output, 0);
+  __lsx_vst(temp1, output, 16);
+
+  DUP2_ARG2(__lsx_vpickod_d, in1, in0, in3, in2, in0, in2);
+  __lsx_vst(in0, output, 32);
+  __lsx_vst(in2, output, 48);
+}
diff --git a/libvpx/vp8/encoder/loongarch/encodeopt_lsx.c b/libvpx/vp8/encoder/loongarch/encodeopt_lsx.c
new file mode 100644
index 0000000..4ad4cab
--- /dev/null
+++ b/libvpx/vp8/encoder/loongarch/encodeopt_lsx.c
@@ -0,0 +1,82 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vp8_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+#include "vp8/encoder/block.h"
+
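+/* Sum of squared differences between the transform coefficients and
+ * their dequantized counterparts. Even/odd widening subtracts spread
+ * the 16 halfword residuals across 32-bit lanes so the squared
+ * accumulation cannot overflow. */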
+int32_t vp8_block_error_lsx(int16_t *coeff_ptr, int16_t *dq_coeff_ptr) {
+  int32_t err = 0;
+  __m128i dq_coeff0, dq_coeff1, coeff0, coeff1;
+  __m128i reg0, reg1, reg2, reg3, error;
+
+  DUP4_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, dq_coeff_ptr, 0,
+            dq_coeff_ptr, 16, coeff0, coeff1, dq_coeff0, dq_coeff1);
+  DUP2_ARG2(__lsx_vsubwev_w_h, coeff0, dq_coeff0, coeff1, dq_coeff1, reg0,
+            reg2);
+  DUP2_ARG2(__lsx_vsubwod_w_h, coeff0, dq_coeff0, coeff1, dq_coeff1, reg1,
+            reg3);
+  error = __lsx_vmul_w(reg0, reg0);
+  DUP2_ARG3(__lsx_vmadd_w, error, reg1, reg1, error, reg2, reg2, error, error);
+  error = __lsx_vmadd_w(error, reg3, reg3);
+  error = __lsx_vhaddw_d_w(error, error);
+  err = __lsx_vpickve2gr_w(error, 0);
+  err += __lsx_vpickve2gr_w(error, 2);
+  return err;
+}
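+/* Macroblock-wide version of the above: accumulates the squared
+ * coefficient error over all 16 luma blocks, two blocks per
+ * iteration. When dc == 1 the DC lane of each block is masked off via
+ * mask0/__lsx_vbitsel_v, matching the scalar version, which skips
+ * coefficient 0 in that case. */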
+
+int32_t vp8_mbblock_error_lsx(MACROBLOCK *mb, int32_t dc) {
+  BLOCK *be;
+  BLOCKD *bd;
+  int16_t *coeff, *dq_coeff;
+  int32_t err = 0;
+  uint32_t loop_cnt;
+  __m128i src0, src1, src2, src3;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7, error;
+  __m128i mask0 = __lsx_vldi(0xFF);
+  __m128i zero = __lsx_vldi(0);
+
+  if (dc == 1) {
+    mask0 = __lsx_vinsgr2vr_w(mask0, 0, 0);
+  }
+
+  for (loop_cnt = 0; loop_cnt < 8; loop_cnt++) {
+    int32_t loop_tmp = loop_cnt << 1;
+    be = &mb->block[loop_tmp];
+    bd = &mb->e_mbd.block[loop_tmp];
+    coeff = be->coeff;
+    dq_coeff = bd->dqcoeff;
+    DUP4_ARG2(__lsx_vld, coeff, 0, coeff, 16, dq_coeff, 0, dq_coeff, 16, src0,
+              src1, tmp0, tmp1);
+    be = &mb->block[loop_tmp + 1];
+    bd = &mb->e_mbd.block[loop_tmp + 1];
+    coeff = be->coeff;
+    dq_coeff = bd->dqcoeff;
+    DUP4_ARG2(__lsx_vld, coeff, 0, coeff, 16, dq_coeff, 0, dq_coeff, 16, src2,
+              src3, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vsubwev_w_h, src0, tmp0, src1, tmp1, src2, tmp2, src3, tmp3,
+              reg0, reg2, reg4, reg6);
+    DUP4_ARG2(__lsx_vsubwod_w_h, src0, tmp0, src1, tmp1, src2, tmp2, src3, tmp3,
+              reg1, reg3, reg5, reg7);
+    DUP2_ARG3(__lsx_vbitsel_v, zero, reg0, mask0, zero, reg4, mask0, reg0,
+              reg4);
+    error = __lsx_vmul_w(reg0, reg0);
+    DUP4_ARG3(__lsx_vmadd_w, error, reg1, reg1, error, reg2, reg2, error, reg3,
+              reg3, error, reg4, reg4, error, error, error, error);
+    DUP2_ARG3(__lsx_vmadd_w, error, reg5, reg5, error, reg6, reg6, error,
+              error);
+    error = __lsx_vmadd_w(error, reg7, reg7);
+    error = __lsx_vhaddw_d_w(error, error);
+    error = __lsx_vhaddw_q_d(error, error);
+    err += __lsx_vpickve2gr_w(error, 0);
+  }
+  return err;
+}
diff --git a/libvpx/vp8/encoder/loongarch/quantize_lsx.c b/libvpx/vp8/encoder/loongarch/quantize_lsx.c
new file mode 100644
index 0000000..7588919
--- /dev/null
+++ b/libvpx/vp8/encoder/loongarch/quantize_lsx.c
@@ -0,0 +1,145 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdint.h>
+#include "./vp8_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+#include "vp8/encoder/block.h"
+
+#define BOOST_QUANT1(_in0, _in1, _in2, _ui)               \
+  {                                                       \
+    if (boost_temp[0] <= __lsx_vpickve2gr_h(_in0, _ui)) { \
+      if (__lsx_vpickve2gr_h(_in1, _ui)) {                \
+        eob = _ui;                                        \
+        boost_temp = zbin_boost;                          \
+      } else {                                            \
+        boost_temp++;                                     \
+      }                                                   \
+    } else {                                              \
+      _in2 = __lsx_vinsgr2vr_h(_in2, 0, _ui);             \
+      boost_temp++;                                       \
+    }                                                     \
+  }
+
+#define BOOST_QUANT2(_in0, _in1, _in2, _ui)               \
+  {                                                       \
+    if (boost_temp[0] <= __lsx_vpickve2gr_h(_in0, _ui)) { \
+      if (__lsx_vpickve2gr_h(_in1, _ui)) {                \
+        eob = _ui + 8;                                    \
+        boost_temp = zbin_boost;                          \
+      } else {                                            \
+        boost_temp++;                                     \
+      }                                                   \
+    } else {                                              \
+      _in2 = __lsx_vinsgr2vr_h(_in2, 0, _ui);             \
+      boost_temp++;                                       \
+    }                                                     \
+  }
+
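+/* Regular (zero-bin boosted) quantizer. The vshuf masks reorder each
+ * block into zigzag scan order up front; BOOST_QUANT1/2 then walk the
+ * 16 lanes in scan order, zeroing coefficients that land inside the
+ * boost-widened zero bin and tracking the last surviving coefficient
+ * as eob. The two macros differ only in the +8 lane offset for the
+ * second vector. */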
+static int8_t exact_regular_quantize_b_lsx(
+    int16_t *zbin_boost, int16_t *coeff_ptr, int16_t *zbin, int16_t *round,
+    int16_t *quant, int16_t *quant_shift, int16_t *de_quant, int16_t zbin_oq_in,
+    int16_t *q_coeff, int16_t *dq_coeff) {
+  int32_t eob;
+  int16_t *boost_temp = zbin_boost;
+  __m128i inv_zig_zag = { 0x0C07040206050100, 0x0F0E0A090D0B0803 };
+  __m128i sign_z0, sign_z1, q_coeff0, q_coeff1;
+  __m128i z_bin0, z_bin1, zbin_o_q, x0, x1, sign_x0, sign_x1, de_quant0,
+      de_quant1;
+  __m128i z0, z1, round0, round1, quant0, quant2;
+  __m128i inv_zig_zag0, inv_zig_zag1;
+  __m128i zigzag_mask0 = { 0x0008000400010000, 0x0006000300020005 };
+  __m128i zigzag_mask1 = { 0x000A000D000C0009, 0x000F000E000B0007 };
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i zero = __lsx_vldi(0);
+
+  zbin_o_q = __lsx_vreplgr2vr_h(zbin_oq_in);
+  inv_zig_zag0 = __lsx_vilvl_b(zero, inv_zig_zag);
+  inv_zig_zag1 = __lsx_vilvh_b(zero, inv_zig_zag);
+  eob = -1;
+  DUP4_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, round, 0, round, 16, tmp0,
+            tmp1, tmp2, tmp3);
+  DUP4_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp1, tmp0, zigzag_mask1, tmp1, tmp0,
+            zigzag_mask0, tmp3, tmp2, zigzag_mask1, tmp3, tmp2, z0, z1, round0,
+            round1);
+  DUP4_ARG2(__lsx_vld, quant, 0, quant, 16, zbin, 0, zbin, 16, tmp0, tmp1, tmp2,
+            tmp3);
+  DUP4_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp1, tmp0, zigzag_mask1, tmp1, tmp0,
+            zigzag_mask0, tmp3, tmp2, zigzag_mask1, tmp3, tmp2, quant0, quant2,
+            z_bin0, z_bin1);
+  DUP2_ARG2(__lsx_vsrai_h, z0, 15, z1, 15, sign_z0, sign_z1);
+  DUP2_ARG2(__lsx_vadda_h, z0, zero, z1, zero, x0, x1);
+  DUP2_ARG2(__lsx_vsub_h, x0, z_bin0, x1, z_bin1, z_bin0, z_bin1);
+  DUP2_ARG2(__lsx_vsub_h, z_bin0, zbin_o_q, z_bin1, zbin_o_q, z_bin0, z_bin1);
+  DUP2_ARG2(__lsx_vmulwev_w_h, quant0, round0, quant2, round1, tmp0, tmp2);
+  DUP2_ARG2(__lsx_vmulwod_w_h, quant0, round0, quant2, round1, tmp1, tmp3);
+  DUP2_ARG3(__lsx_vmaddwev_w_h, tmp0, quant0, x0, tmp2, quant2, x1, tmp0, tmp2);
+  DUP2_ARG3(__lsx_vmaddwod_w_h, tmp1, quant0, x0, tmp3, quant2, x1, tmp1, tmp3);
+  DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, q_coeff0, q_coeff1);
+
+  DUP2_ARG2(__lsx_vld, quant_shift, 0, quant_shift, 16, tmp1, tmp3);
+  DUP2_ARG3(__lsx_vshuf_h, zigzag_mask0, tmp3, tmp1, zigzag_mask1, tmp3, tmp1,
+            quant0, quant2);
+  DUP2_ARG2(__lsx_vadd_h, x0, round0, x1, round1, x0, x1);
+  DUP2_ARG2(__lsx_vmulwev_w_h, quant0, q_coeff0, quant2, q_coeff1, tmp0, tmp2);
+  DUP2_ARG2(__lsx_vmulwod_w_h, quant0, q_coeff0, quant2, q_coeff1, tmp1, tmp3);
+  DUP2_ARG3(__lsx_vmaddwev_w_h, tmp0, quant0, x0, tmp2, quant2, x1, tmp0, tmp2);
+  DUP2_ARG3(__lsx_vmaddwod_w_h, tmp1, quant0, x0, tmp3, quant2, x1, tmp1, tmp3);
+  DUP2_ARG2(__lsx_vpackod_h, tmp1, tmp0, tmp3, tmp2, x0, x1);
+  DUP2_ARG2(__lsx_vxor_v, x0, sign_z0, x1, sign_z1, sign_x0, sign_x1);
+  DUP2_ARG2(__lsx_vsub_h, sign_x0, sign_z0, sign_x1, sign_z1, sign_x0, sign_x1);
+
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 0);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 1);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 2);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 3);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 4);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 5);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 6);
+  BOOST_QUANT1(z_bin0, x0, sign_x0, 7);
+
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 0);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 1);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 2);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 3);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 4);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 5);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 6);
+  BOOST_QUANT2(z_bin1, x1, sign_x1, 7);
+
+  DUP2_ARG2(__lsx_vld, de_quant, 0, de_quant, 16, de_quant0, de_quant1);
+  DUP2_ARG3(__lsx_vshuf_h, inv_zig_zag0, sign_x1, sign_x0, inv_zig_zag1,
+            sign_x1, sign_x0, q_coeff0, q_coeff1);
+  DUP2_ARG2(__lsx_vmul_h, de_quant0, q_coeff0, de_quant1, q_coeff1, de_quant0,
+            de_quant1);
+  __lsx_vst(q_coeff0, q_coeff, 0);
+  __lsx_vst(q_coeff1, q_coeff, 16);
+  __lsx_vst(de_quant0, dq_coeff, 0);
+  __lsx_vst(de_quant1, dq_coeff, 16);
+
+  return (int8_t)(eob + 1);
+}
+
+void vp8_regular_quantize_b_lsx(BLOCK *b, BLOCKD *d) {
+  int16_t *zbin_boost_ptr = b->zrun_zbin_boost;
+  int16_t *coeff_ptr = b->coeff;
+  int16_t *zbin_ptr = b->zbin;
+  int16_t *round_ptr = b->round;
+  int16_t *quant_ptr = b->quant;
+  int16_t *quant_shift_ptr = b->quant_shift;
+  int16_t *qcoeff_ptr = d->qcoeff;
+  int16_t *dqcoeff_ptr = d->dqcoeff;
+  int16_t *dequant_ptr = d->dequant;
+  int16_t zbin_oq_value = b->zbin_extra;
+
+  *d->eob = exact_regular_quantize_b_lsx(
+      zbin_boost_ptr, coeff_ptr, zbin_ptr, round_ptr, quant_ptr,
+      quant_shift_ptr, dequant_ptr, zbin_oq_value, qcoeff_ptr, dqcoeff_ptr);
+}
diff --git a/libvpx/vp8/encoder/mcomp.c b/libvpx/vp8/encoder/mcomp.c
index 9e7f5c7..ae092c6 100644
--- a/libvpx/vp8/encoder/mcomp.c
+++ b/libvpx/vp8/encoder/mcomp.c
@@ -204,19 +204,21 @@
 /* returns distortion + motion vector cost */
 #define ERR(r, c) (MVC(r, c) + DIST(r, c))
 /* checks if (r,c) has better score than previous best */
-#define CHECK_BETTER(v, r, c)                           \
-  IFMVCV(r, c,                                          \
-         {                                              \
-           thismse = DIST(r, c);                        \
-           if ((v = (MVC(r, c) + thismse)) < besterr) { \
-             besterr = v;                               \
-             br = r;                                    \
-             bc = c;                                    \
-             *distortion = thismse;                     \
-             *sse1 = sse;                               \
-           }                                            \
-         },                                             \
-         v = UINT_MAX;)
+#define CHECK_BETTER(v, r, c)                             \
+  do {                                                    \
+    IFMVCV(r, c,                                          \
+           {                                              \
+             thismse = DIST(r, c);                        \
+             if ((v = (MVC(r, c) + thismse)) < besterr) { \
+               besterr = v;                               \
+               br = r;                                    \
+               bc = c;                                    \
+               *distortion = thismse;                     \
+               *sse1 = sse;                               \
+             }                                            \
+           },                                             \
+           v = UINT_MAX;)                                 \
+  } while (0)
 
 int vp8_find_best_sub_pixel_step_iteratively(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
                                              int_mv *bestmv, int_mv *ref_mv,
@@ -800,13 +802,13 @@
 }
 
 #define CHECK_BOUNDS(range)                    \
-  {                                            \
+  do {                                         \
     all_in = 1;                                \
     all_in &= ((br - range) >= x->mv_row_min); \
     all_in &= ((br + range) <= x->mv_row_max); \
     all_in &= ((bc - range) >= x->mv_col_min); \
     all_in &= ((bc + range) <= x->mv_col_max); \
-  }
+  } while (0)
 
 #define CHECK_POINT                                  \
   {                                                  \
@@ -817,7 +819,7 @@
   }
 
 #define CHECK_BETTER                                                     \
-  {                                                                      \
+  do {                                                                   \
     if (thissad < bestsad) {                                             \
       thissad +=                                                         \
           mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit); \
@@ -826,7 +828,7 @@
         best_site = i;                                                   \
       }                                                                  \
     }                                                                    \
-  }
+  } while (0)
 
 static const MV next_chkpts[6][3] = {
   { { -2, 0 }, { -1, -2 }, { 1, -2 } }, { { -1, -2 }, { 1, -2 }, { 2, 0 } },
@@ -901,7 +903,7 @@
 #endif
 
   /* hex search */
-  CHECK_BOUNDS(2)
+  CHECK_BOUNDS(2);
 
   if (all_in) {
     for (i = 0; i < 6; ++i) {
@@ -910,7 +912,7 @@
       this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) +
                     this_mv.as_mv.col;
       thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
-      CHECK_BETTER
+      CHECK_BETTER;
     }
   } else {
     for (i = 0; i < 6; ++i) {
@@ -920,7 +922,7 @@
       this_offset = base_offset + (this_mv.as_mv.row * in_what_stride) +
                     this_mv.as_mv.col;
       thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
-      CHECK_BETTER
+      CHECK_BETTER;
     }
   }
 
@@ -934,7 +936,7 @@
 
   for (j = 1; j < hex_range; ++j) {
     best_site = -1;
-    CHECK_BOUNDS(2)
+    CHECK_BOUNDS(2);
 
     if (all_in) {
       for (i = 0; i < 3; ++i) {
@@ -943,7 +945,7 @@
         this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
                       this_mv.as_mv.col;
         thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
-        CHECK_BETTER
+        CHECK_BETTER;
       }
     } else {
       for (i = 0; i < 3; ++i) {
@@ -953,7 +955,7 @@
         this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
                       this_mv.as_mv.col;
         thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
-        CHECK_BETTER
+        CHECK_BETTER;
       }
     }
 
@@ -975,7 +977,7 @@
 cal_neighbors:
   for (j = 0; j < dia_range; ++j) {
     best_site = -1;
-    CHECK_BOUNDS(1)
+    CHECK_BOUNDS(1);
 
     if (all_in) {
       for (i = 0; i < 4; ++i) {
@@ -984,7 +986,7 @@
         this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
                       this_mv.as_mv.col;
         thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
-        CHECK_BETTER
+        CHECK_BETTER;
       }
     } else {
       for (i = 0; i < 4; ++i) {
@@ -994,7 +996,7 @@
         this_offset = base_offset + (this_mv.as_mv.row * (in_what_stride)) +
                       this_mv.as_mv.col;
         thissad = vfp->sdf(what, what_stride, this_offset, in_what_stride);
-        CHECK_BETTER
+        CHECK_BETTER;
       }
     }
 
@@ -1127,7 +1129,7 @@
          mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
 
-#if HAVE_SSE2 || HAVE_MSA
+#if HAVE_SSE2 || HAVE_MSA || HAVE_LSX
 int vp8_diamond_search_sadx4(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
                              int_mv *best_mv, int search_param, int sad_per_bit,
                              int *num00, vp8_variance_fn_ptr_t *fn_ptr,
@@ -1276,12 +1278,12 @@
   return fn_ptr->vf(what, what_stride, best_address, in_what_stride, &thissad) +
          mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
-#endif  // HAVE_SSE2 || HAVE_MSA
+#endif  // HAVE_SSE2 || HAVE_MSA || HAVE_LSX
 
-int vp8_full_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
-                          int sad_per_bit, int distance,
-                          vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
-                          int_mv *center_mv) {
+int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+                        int sad_per_bit, int distance,
+                        vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
+                        int_mv *center_mv) {
   unsigned char *what = (*(b->base_src) + b->src);
   int what_stride = b->src_stride;
   unsigned char *in_what;
@@ -1325,8 +1327,8 @@
   bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) +
             mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
 
-  /* Apply further limits to prevent us looking using vectors that
-   * stretch beyiond the UMV border
+  /* Apply further limits to prevent us from searching with vectors that
+   * stretch beyond the UMV border
    */
   if (col_min < x->mv_col_min) col_min = x->mv_col_min;
 
@@ -1343,122 +1345,6 @@
     for (c = col_min; c < col_max; ++c) {
       thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
 
-      this_mv.as_mv.col = c;
-      thissad += mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
-      if (thissad < bestsad) {
-        bestsad = thissad;
-        best_mv->as_mv.row = r;
-        best_mv->as_mv.col = c;
-        bestaddress = check_here;
-      }
-
-      check_here++;
-    }
-  }
-
-  this_mv.as_mv.row = best_mv->as_mv.row << 3;
-  this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
-  return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) +
-         mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
-}
-
-#if HAVE_SSSE3
-int vp8_full_search_sadx3(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
-                          int sad_per_bit, int distance,
-                          vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
-                          int_mv *center_mv) {
-  unsigned char *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
-  unsigned char *in_what;
-  int pre_stride = x->e_mbd.pre.y_stride;
-  unsigned char *base_pre = x->e_mbd.pre.y_buffer;
-  int in_what_stride = pre_stride;
-  int mv_stride = pre_stride;
-  unsigned char *bestaddress;
-  int_mv *best_mv = &d->bmi.mv;
-  int_mv this_mv;
-  unsigned int bestsad;
-  unsigned int thissad;
-  int r, c;
-
-  unsigned char *check_here;
-
-  int ref_row = ref_mv->as_mv.row;
-  int ref_col = ref_mv->as_mv.col;
-
-  int row_min = ref_row - distance;
-  int row_max = ref_row + distance;
-  int col_min = ref_col - distance;
-  int col_max = ref_col + distance;
-
-  unsigned int sad_array[3];
-
-  int *mvsadcost[2];
-  int_mv fcenter_mv;
-
-  mvsadcost[0] = x->mvsadcost[0];
-  mvsadcost[1] = x->mvsadcost[1];
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  /* Work out the mid point for the search */
-  in_what = base_pre + d->offset;
-  bestaddress = in_what + (ref_row * pre_stride) + ref_col;
-
-  best_mv->as_mv.row = ref_row;
-  best_mv->as_mv.col = ref_col;
-
-  /* Baseline value at the centre */
-  bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) +
-            mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
-  /* Apply further limits to prevent us looking using vectors that stretch
-   * beyond the UMV border
-   */
-  if (col_min < x->mv_col_min) col_min = x->mv_col_min;
-
-  if (col_max > x->mv_col_max) col_max = x->mv_col_max;
-
-  if (row_min < x->mv_row_min) row_min = x->mv_row_min;
-
-  if (row_max > x->mv_row_max) row_max = x->mv_row_max;
-
-  for (r = row_min; r < row_max; ++r) {
-    this_mv.as_mv.row = r;
-    check_here = r * mv_stride + in_what + col_min;
-    c = col_min;
-
-    while ((c + 2) < col_max) {
-      int i;
-
-      fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
-
-      for (i = 0; i < 3; ++i) {
-        thissad = sad_array[i];
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.col = c;
-          thissad +=
-              mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_mv->as_mv.row = r;
-            best_mv->as_mv.col = c;
-            bestaddress = check_here;
-          }
-        }
-
-        check_here++;
-        c++;
-      }
-    }
-
-    while (c < col_max) {
-      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
-
       if (thissad < bestsad) {
         this_mv.as_mv.col = c;
         thissad +=
@@ -1473,155 +1359,6 @@
       }
 
       check_here++;
-      c++;
-    }
-  }
-
-  this_mv.as_mv.row = best_mv->as_mv.row << 3;
-  this_mv.as_mv.col = best_mv->as_mv.col << 3;
-
-  return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) +
-         mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
-}
-#endif  // HAVE_SSSE3
-
-#if HAVE_SSE4_1
-int vp8_full_search_sadx8(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
-                          int sad_per_bit, int distance,
-                          vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
-                          int_mv *center_mv) {
-  unsigned char *what = (*(b->base_src) + b->src);
-  int what_stride = b->src_stride;
-  int pre_stride = x->e_mbd.pre.y_stride;
-  unsigned char *base_pre = x->e_mbd.pre.y_buffer;
-  unsigned char *in_what;
-  int in_what_stride = pre_stride;
-  int mv_stride = pre_stride;
-  unsigned char *bestaddress;
-  int_mv *best_mv = &d->bmi.mv;
-  int_mv this_mv;
-  unsigned int bestsad;
-  unsigned int thissad;
-  int r, c;
-
-  unsigned char *check_here;
-
-  int ref_row = ref_mv->as_mv.row;
-  int ref_col = ref_mv->as_mv.col;
-
-  int row_min = ref_row - distance;
-  int row_max = ref_row + distance;
-  int col_min = ref_col - distance;
-  int col_max = ref_col + distance;
-
-  DECLARE_ALIGNED(16, unsigned int, sad_array8[8]);
-  unsigned int sad_array[3];
-
-  int *mvsadcost[2];
-  int_mv fcenter_mv;
-
-  mvsadcost[0] = x->mvsadcost[0];
-  mvsadcost[1] = x->mvsadcost[1];
-  fcenter_mv.as_mv.row = center_mv->as_mv.row >> 3;
-  fcenter_mv.as_mv.col = center_mv->as_mv.col >> 3;
-
-  /* Work out the mid point for the search */
-  in_what = base_pre + d->offset;
-  bestaddress = in_what + (ref_row * pre_stride) + ref_col;
-
-  best_mv->as_mv.row = ref_row;
-  best_mv->as_mv.col = ref_col;
-
-  /* Baseline value at the centre */
-  bestsad = fn_ptr->sdf(what, what_stride, bestaddress, in_what_stride) +
-            mvsad_err_cost(best_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
-  /* Apply further limits to prevent us looking using vectors that stretch
-   * beyond the UMV border
-   */
-  if (col_min < x->mv_col_min) col_min = x->mv_col_min;
-
-  if (col_max > x->mv_col_max) col_max = x->mv_col_max;
-
-  if (row_min < x->mv_row_min) row_min = x->mv_row_min;
-
-  if (row_max > x->mv_row_max) row_max = x->mv_row_max;
-
-  for (r = row_min; r < row_max; ++r) {
-    this_mv.as_mv.row = r;
-    check_here = r * mv_stride + in_what + col_min;
-    c = col_min;
-
-    while ((c + 7) < col_max) {
-      int i;
-
-      fn_ptr->sdx8f(what, what_stride, check_here, in_what_stride, sad_array8);
-
-      for (i = 0; i < 8; ++i) {
-        thissad = sad_array8[i];
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.col = c;
-          thissad +=
-              mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_mv->as_mv.row = r;
-            best_mv->as_mv.col = c;
-            bestaddress = check_here;
-          }
-        }
-
-        check_here++;
-        c++;
-      }
-    }
-
-    while ((c + 2) < col_max) {
-      int i;
-
-      fn_ptr->sdx3f(what, what_stride, check_here, in_what_stride, sad_array);
-
-      for (i = 0; i < 3; ++i) {
-        thissad = sad_array[i];
-
-        if (thissad < bestsad) {
-          this_mv.as_mv.col = c;
-          thissad +=
-              mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
-          if (thissad < bestsad) {
-            bestsad = thissad;
-            best_mv->as_mv.row = r;
-            best_mv->as_mv.col = c;
-            bestaddress = check_here;
-          }
-        }
-
-        check_here++;
-        c++;
-      }
-    }
-
-    while (c < col_max) {
-      thissad = fn_ptr->sdf(what, what_stride, check_here, in_what_stride);
-
-      if (thissad < bestsad) {
-        this_mv.as_mv.col = c;
-        thissad +=
-            mvsad_err_cost(&this_mv, &fcenter_mv, mvsadcost, sad_per_bit);
-
-        if (thissad < bestsad) {
-          bestsad = thissad;
-          best_mv->as_mv.row = r;
-          best_mv->as_mv.col = c;
-          bestaddress = check_here;
-        }
-      }
-
-      check_here++;
-      c++;
     }
   }
 
@@ -1631,7 +1368,6 @@
   return fn_ptr->vf(what, what_stride, bestaddress, in_what_stride, &thissad) +
          mv_err_cost(&this_mv, center_mv, mvcost, x->errorperbit);
 }
-#endif  // HAVE_SSE4_1
 
 int vp8_refining_search_sad_c(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
                               int_mv *ref_mv, int error_per_bit,
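
The do { ... } while (0) rewrites of CHECK_BOUNDS and CHECK_BETTER above (and
of vp9_copy in vp9_common.h further down) make each macro expansion behave as
a single statement, which is why every call site gains a trailing semicolon.
A minimal sketch, not part of the patch, of the failure mode the idiom avoids:

    #include <stdio.h>

    #define BAD_SWAP(a, b) { int t = (a); (a) = (b); (b) = t; }
    #define GOOD_SWAP(a, b) do { int t = (a); (a) = (b); (b) = t; } while (0)

    int main(void) {
      int x = 2, y = 1;
      if (x > y)
        GOOD_SWAP(x, y); /* with BAD_SWAP, this ';' would orphan the else */
      else
        puts("already ordered");
      printf("%d %d\n", x, y); /* prints "1 2" */
      return 0;
    }
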
diff --git a/libvpx/vp8/encoder/mcomp.h b/libvpx/vp8/encoder/mcomp.h
index 57c18f5..1ee6fe5 100644
--- a/libvpx/vp8/encoder/mcomp.h
+++ b/libvpx/vp8/encoder/mcomp.h
@@ -50,10 +50,10 @@
 fractional_mv_step_fp vp8_find_best_half_pixel_step;
 fractional_mv_step_fp vp8_skip_fractional_mv_step;
 
-typedef int (*vp8_full_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
-                                    int_mv *ref_mv, int sad_per_bit,
-                                    int distance, vp8_variance_fn_ptr_t *fn_ptr,
-                                    int *mvcost[2], int_mv *center_mv);
+int vp8_full_search_sad(MACROBLOCK *x, BLOCK *b, BLOCKD *d, int_mv *ref_mv,
+                        int sad_per_bit, int distance,
+                        vp8_variance_fn_ptr_t *fn_ptr, int *mvcost[2],
+                        int_mv *center_mv);
 
 typedef int (*vp8_refining_search_fn_t)(MACROBLOCK *x, BLOCK *b, BLOCKD *d,
                                         int_mv *ref_mv, int sad_per_bit,
diff --git a/libvpx/vp8/encoder/onyx_if.c b/libvpx/vp8/encoder/onyx_if.c
index 2b059a1..ffb3867 100644
--- a/libvpx/vp8/encoder/onyx_if.c
+++ b/libvpx/vp8/encoder/onyx_if.c
@@ -36,6 +36,7 @@
 #include "vp8/common/swapyv12buffer.h"
 #include "vp8/common/threading.h"
 #include "vpx_ports/system_state.h"
+#include "vpx_ports/vpx_once.h"
 #include "vpx_ports/vpx_timer.h"
 #include "vpx_util/vpx_write_yuv_frame.h"
 #if VPX_ARCH_ARM
@@ -183,7 +184,7 @@
 extern FILE *vpxlogc;
 #endif
 
-static void save_layer_context(VP8_COMP *cpi) {
+void vp8_save_layer_context(VP8_COMP *cpi) {
   LAYER_CONTEXT *lc = &cpi->layer_context[cpi->current_layer];
 
   /* Save layer dependent coding state */
@@ -222,7 +223,7 @@
          sizeof(cpi->mb.count_mb_ref_frame_usage));
 }
 
-static void restore_layer_context(VP8_COMP *cpi, const int layer) {
+void vp8_restore_layer_context(VP8_COMP *cpi, const int layer) {
   LAYER_CONTEXT *lc = &cpi->layer_context[layer];
 
   /* Restore layer dependent coding state */
@@ -269,9 +270,9 @@
   return (int)(llval * llnum / llden);
 }
 
-static void init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf,
-                                        const int layer,
-                                        double prev_layer_framerate) {
+void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf,
+                                     const int layer,
+                                     double prev_layer_framerate) {
   LAYER_CONTEXT *lc = &cpi->layer_context[layer];
 
   lc->framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[layer];
@@ -336,12 +337,12 @@
   // We need this to set the layer context for the new layers below.
   if (prev_num_layers == 1) {
     cpi->current_layer = 0;
-    save_layer_context(cpi);
+    vp8_save_layer_context(cpi);
   }
   for (i = 0; i < curr_num_layers; ++i) {
     LAYER_CONTEXT *lc = &cpi->layer_context[i];
     if (i >= prev_num_layers) {
-      init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
+      vp8_init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
     }
     // The initial buffer levels are set based on their starting levels.
     // We could set the buffer levels based on the previous state (normalized
@@ -356,7 +357,7 @@
     // state (to smooth-out quality dips/rate fluctuation at transition)?
 
     // We need to treat the 1 layer case separately: oxcf.target_bitrate[i]
-    // is not set for 1 layer, and the restore_layer_context/save_context()
+    // is not set for 1 layer, and vp8_restore_layer_context()/vp8_save_layer_context()
     // are not called in the encoding loop, so we need to call it here to
     // pass the layer context state to |cpi|.
     if (curr_num_layers == 1) {
@@ -364,7 +365,7 @@
       lc->buffer_level =
           cpi->oxcf.starting_buffer_level_in_ms * lc->target_bandwidth / 1000;
       lc->bits_off_target = lc->buffer_level;
-      restore_layer_context(cpi, 0);
+      vp8_restore_layer_context(cpi, 0);
     }
     prev_layer_framerate = cpi->output_framerate / cpi->oxcf.rate_decimator[i];
   }
@@ -394,16 +395,13 @@
 
 static void dealloc_raw_frame_buffers(VP8_COMP *cpi);
 
-void vp8_initialize_enc(void) {
-  static volatile int init_done = 0;
-
-  if (!init_done) {
-    vpx_dsp_rtcd();
-    vp8_init_intra_predictors();
-    init_done = 1;
-  }
+static void initialize_enc(void) {
+  vpx_dsp_rtcd();
+  vp8_init_intra_predictors();
 }
 
+void vp8_initialize_enc(void) { once(initialize_enc); }
+
 static void dealloc_compressor_data(VP8_COMP *cpi) {
   vpx_free(cpi->tplist);
   cpi->tplist = NULL;
@@ -1023,7 +1021,7 @@
 
       memset(cpi->mb.error_bins, 0, sizeof(cpi->mb.error_bins));
 
-  }; /* switch */
+  } /* switch */
 
   /* Slow quant, dct and trellis not worthwhile for first pass
    * so make sure they are always turned off.
@@ -1274,7 +1272,7 @@
   cpi->framerate = framerate;
   cpi->output_framerate = framerate;
   cpi->per_frame_bandwidth =
-      (int)(cpi->oxcf.target_bandwidth / cpi->output_framerate);
+      (int)round(cpi->oxcf.target_bandwidth / cpi->output_framerate);
   cpi->av_per_frame_bandwidth = cpi->per_frame_bandwidth;
   cpi->min_frame_bandwidth = (int)(cpi->av_per_frame_bandwidth *
                                    cpi->oxcf.two_pass_vbrmin_section / 100);
@@ -1365,7 +1363,7 @@
     double prev_layer_framerate = 0;
 
     for (i = 0; i < cpi->oxcf.number_of_layers; ++i) {
-      init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
+      vp8_init_temporal_layer_context(cpi, oxcf, i, prev_layer_framerate);
       prev_layer_framerate =
           cpi->output_framerate / cpi->oxcf.rate_decimator[i];
     }
@@ -1382,7 +1380,7 @@
 #endif
 }
 
-static void update_layer_contexts(VP8_COMP *cpi) {
+void vp8_update_layer_contexts(VP8_COMP *cpi) {
   VP8_CONFIG *oxcf = &cpi->oxcf;
 
   /* Update snapshots of the layer contexts to reflect new parameters */
@@ -1417,8 +1415,8 @@
       /* Work out the average size of a frame within this layer */
       if (i > 0) {
         lc->avg_frame_size_for_layer =
-            (int)((oxcf->target_bitrate[i] - oxcf->target_bitrate[i - 1]) *
-                  1000 / (lc->framerate - prev_layer_framerate));
+            (int)round((oxcf->target_bitrate[i] - oxcf->target_bitrate[i - 1]) *
+                       1000 / (lc->framerate - prev_layer_framerate));
       }
 
       prev_layer_framerate = lc->framerate;
@@ -1910,6 +1908,7 @@
 
   cpi->force_maxqp = 0;
   cpi->frames_since_last_drop_overshoot = 0;
+  cpi->rt_always_update_correction_factor = 0;
 
   cpi->b_calculate_psnr = CONFIG_INTERNAL_STATS;
 #if CONFIG_INTERNAL_STATS
@@ -2013,36 +2012,26 @@
   cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
   cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16;
   cpi->fn_ptr[BLOCK_16X16].svf = vpx_sub_pixel_variance16x16;
-  cpi->fn_ptr[BLOCK_16X16].sdx3f = vpx_sad16x16x3;
-  cpi->fn_ptr[BLOCK_16X16].sdx8f = vpx_sad16x16x8;
   cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
 
   cpi->fn_ptr[BLOCK_16X8].sdf = vpx_sad16x8;
   cpi->fn_ptr[BLOCK_16X8].vf = vpx_variance16x8;
   cpi->fn_ptr[BLOCK_16X8].svf = vpx_sub_pixel_variance16x8;
-  cpi->fn_ptr[BLOCK_16X8].sdx3f = vpx_sad16x8x3;
-  cpi->fn_ptr[BLOCK_16X8].sdx8f = vpx_sad16x8x8;
   cpi->fn_ptr[BLOCK_16X8].sdx4df = vpx_sad16x8x4d;
 
   cpi->fn_ptr[BLOCK_8X16].sdf = vpx_sad8x16;
   cpi->fn_ptr[BLOCK_8X16].vf = vpx_variance8x16;
   cpi->fn_ptr[BLOCK_8X16].svf = vpx_sub_pixel_variance8x16;
-  cpi->fn_ptr[BLOCK_8X16].sdx3f = vpx_sad8x16x3;
-  cpi->fn_ptr[BLOCK_8X16].sdx8f = vpx_sad8x16x8;
   cpi->fn_ptr[BLOCK_8X16].sdx4df = vpx_sad8x16x4d;
 
   cpi->fn_ptr[BLOCK_8X8].sdf = vpx_sad8x8;
   cpi->fn_ptr[BLOCK_8X8].vf = vpx_variance8x8;
   cpi->fn_ptr[BLOCK_8X8].svf = vpx_sub_pixel_variance8x8;
-  cpi->fn_ptr[BLOCK_8X8].sdx3f = vpx_sad8x8x3;
-  cpi->fn_ptr[BLOCK_8X8].sdx8f = vpx_sad8x8x8;
   cpi->fn_ptr[BLOCK_8X8].sdx4df = vpx_sad8x8x4d;
 
   cpi->fn_ptr[BLOCK_4X4].sdf = vpx_sad4x4;
   cpi->fn_ptr[BLOCK_4X4].vf = vpx_variance4x4;
   cpi->fn_ptr[BLOCK_4X4].svf = vpx_sub_pixel_variance4x4;
-  cpi->fn_ptr[BLOCK_4X4].sdx3f = vpx_sad4x4x3;
-  cpi->fn_ptr[BLOCK_4X4].sdx8f = vpx_sad4x4x8;
   cpi->fn_ptr[BLOCK_4X4].sdx4df = vpx_sad4x4x4d;
 
 #if VPX_ARCH_X86 || VPX_ARCH_X86_64
@@ -2053,7 +2042,6 @@
   cpi->fn_ptr[BLOCK_4X4].copymem = vp8_copy32xn;
 #endif
 
-  cpi->full_search_sad = vp8_full_search_sad;
   cpi->diamond_search_sad = vp8_diamond_search_sad;
   cpi->refining_search_sad = vp8_refining_search_sad;
 
@@ -3260,7 +3248,7 @@
 #endif  // !CONFIG_REALTIME_ONLY
     default:
       cpi->per_frame_bandwidth =
-          (int)(cpi->target_bandwidth / cpi->output_framerate);
+          (int)round(cpi->target_bandwidth / cpi->output_framerate);
       break;
   }
 
@@ -3480,7 +3468,7 @@
    * Note that dropping a key frame can be problematic if spatial
    * resampling is also active
    */
-  if (cpi->decimation_factor > 0) {
+  if (cpi->decimation_factor > 0 && cpi->drop_frames_allowed) {
     switch (cpi->decimation_factor) {
       case 1:
         cpi->per_frame_bandwidth = cpi->per_frame_bandwidth * 3 / 2;
@@ -4016,7 +4004,8 @@
     if (frame_over_shoot_limit == 0) frame_over_shoot_limit = 1;
 
     /* Are we overshooting and up against the limit of active max Q? */
-    if (((cpi->pass != 2) ||
+    if (!cpi->rt_always_update_correction_factor &&
+        ((cpi->pass != 2) ||
          (cpi->oxcf.end_usage == USAGE_STREAM_FROM_SERVER)) &&
         (Q == cpi->active_worst_quality) &&
         (cpi->active_worst_quality < cpi->worst_quality) &&
@@ -4514,10 +4503,10 @@
     cpi->bits_off_target = cpi->oxcf.maximum_buffer_size;
   }
 
-  // If the frame dropper is not enabled, don't let the buffer level go below
-  // some threshold, given here by -|maximum_buffer_size|. For now we only do
-  // this for screen content input.
-  if (cpi->drop_frames_allowed == 0 && cpi->oxcf.screen_content_mode &&
+  // Don't let the buffer level go below some threshold, given here
+  // by -|maximum_buffer_size|. For now we only do this for
+  // screen content input.
+  if (cpi->oxcf.screen_content_mode &&
       cpi->bits_off_target < -cpi->oxcf.maximum_buffer_size) {
     cpi->bits_off_target = -cpi->oxcf.maximum_buffer_size;
   }
@@ -4552,8 +4541,8 @@
 
     for (i = cpi->current_layer + 1; i < cpi->oxcf.number_of_layers; ++i) {
       LAYER_CONTEXT *lc = &cpi->layer_context[i];
-      int bits_off_for_this_layer = (int)(lc->target_bandwidth / lc->framerate -
-                                          cpi->projected_frame_size);
+      int bits_off_for_this_layer = (int)round(
+          lc->target_bandwidth / lc->framerate - cpi->projected_frame_size);
 
       lc->bits_off_target += bits_off_for_this_layer;
 
@@ -4990,7 +4979,7 @@
   if (cpi->oxcf.number_of_layers > 1) {
     int layer;
 
-    update_layer_contexts(cpi);
+    vp8_update_layer_contexts(cpi);
 
     /* Restore layer specific context & set frame rate */
     if (cpi->temporal_layer_id >= 0) {
@@ -5000,7 +4989,7 @@
           cpi->oxcf
               .layer_id[cpi->temporal_pattern_counter % cpi->oxcf.periodicity];
     }
-    restore_layer_context(cpi, layer);
+    vp8_restore_layer_context(cpi, layer);
     vp8_new_framerate(cpi, cpi->layer_context[layer].framerate);
   }
 
@@ -5131,7 +5120,7 @@
   }
 
   /* Save layer specific state */
-  if (cpi->oxcf.number_of_layers > 1) save_layer_context(cpi);
+  if (cpi->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi);
 
   vpx_usec_timer_mark(&cmptimer);
   cpi->time_compress_data += vpx_usec_timer_elapsed(&cmptimer);
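
vp8_initialize_enc() above drops the hand-rolled static volatile init_done
guard, which is not thread-safe (two threads can both observe the flag as 0),
in favor of once() from vpx_ports/vpx_once.h. A hedged sketch of the same
pattern, using pthread_once as a stand-in for the portable once() and an
illustrative my_initialize_enc():

    #include <pthread.h>
    #include <stdio.h>

    static pthread_once_t enc_once = PTHREAD_ONCE_INIT;

    static void initialize(void) {
      /* e.g. vpx_dsp_rtcd(); vp8_init_intra_predictors(); */
      puts("initialized exactly once");
    }

    void my_initialize_enc(void) { pthread_once(&enc_once, initialize); }
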
diff --git a/libvpx/vp8/encoder/onyx_int.h b/libvpx/vp8/encoder/onyx_int.h
index b96f9b1..424f51b 100644
--- a/libvpx/vp8/encoder/onyx_int.h
+++ b/libvpx/vp8/encoder/onyx_int.h
@@ -549,7 +549,6 @@
   unsigned char *partition_d_end[MAX_PARTITIONS];
 
   fractional_mv_step_fp *find_fractional_mv_step;
-  vp8_full_search_fn_t full_search_sad;
   vp8_refining_search_fn_t refining_search_sad;
   vp8_diamond_search_fn_t diamond_search_sad;
   vp8_variance_fn_ptr_t fn_ptr[BLOCK_MAX_SEGMENTS];
@@ -702,12 +701,22 @@
   int use_roi_static_threshold;
 
   int ext_refresh_frame_flags_pending;
+
+  // Always update correction factor used for rate control after each frame for
+  // realtime encoding.
+  int rt_always_update_correction_factor;
 } VP8_COMP;
 
 void vp8_initialize_enc(void);
 
 void vp8_alloc_compressor_data(VP8_COMP *cpi);
 int vp8_reverse_trans(int x);
+void vp8_init_temporal_layer_context(VP8_COMP *cpi, VP8_CONFIG *oxcf,
+                                     const int layer,
+                                     double prev_layer_framerate);
+void vp8_update_layer_contexts(VP8_COMP *cpi);
+void vp8_save_layer_context(VP8_COMP *cpi);
+void vp8_restore_layer_context(VP8_COMP *cpi, const int layer);
 void vp8_new_framerate(VP8_COMP *cpi, double framerate);
 void vp8_loopfilter_frame(VP8_COMP *cpi, VP8_COMMON *cm);
 
diff --git a/libvpx/vp8/encoder/ratectrl.c b/libvpx/vp8/encoder/ratectrl.c
index 59a4fad..9cd3963 100644
--- a/libvpx/vp8/encoder/ratectrl.c
+++ b/libvpx/vp8/encoder/ratectrl.c
@@ -314,7 +314,7 @@
      * bandwidth per second * fraction of the initial buffer
      * level
      */
-    target = cpi->oxcf.starting_buffer_level / 2;
+    target = (uint64_t)cpi->oxcf.starting_buffer_level / 2;
 
     if (target > cpi->oxcf.target_bandwidth * 3 / 2) {
       target = cpi->oxcf.target_bandwidth * 3 / 2;
@@ -327,7 +327,8 @@
     int initial_boost = 32; /* |3.0 * per_frame_bandwidth| */
     /* Boost depends somewhat on frame rate: only used for 1 layer case. */
     if (cpi->oxcf.number_of_layers == 1) {
-      kf_boost = VPXMAX(initial_boost, (int)(2 * cpi->output_framerate - 16));
+      kf_boost =
+          VPXMAX(initial_boost, (int)round(2 * cpi->output_framerate - 16));
     } else {
       /* Initial factor: set target size to: |3.0 * per_frame_bandwidth|. */
       kf_boost = initial_boost;
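
Two recurring numeric fixes in this file and in onyx_if.c above: per-frame
bandwidth divisions now go through round() instead of being truncated by a
bare (int) cast, and the starting_buffer_level halving is performed after an
explicit cast to uint64_t. A small illustration of the truncation half, with
made-up values:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
      const double target_bandwidth = 1000000.0; /* bits per second */
      const double framerate = 29.97;
      printf("truncated: %d\n", (int)(target_bandwidth / framerate));      /* 33366 */
      printf("rounded:   %d\n", (int)round(target_bandwidth / framerate)); /* 33367 */
      return 0;
    }
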
diff --git a/libvpx/vp8/encoder/rdopt.c b/libvpx/vp8/encoder/rdopt.c
index 79a858e..5821fc7 100644
--- a/libvpx/vp8/encoder/rdopt.c
+++ b/libvpx/vp8/encoder/rdopt.c
@@ -1097,8 +1097,8 @@
             vp8_clamp_mv(&mvp_full, x->mv_col_min, x->mv_col_max, x->mv_row_min,
                          x->mv_row_max);
 
-            thissme = cpi->full_search_sad(x, c, e, &mvp_full, sadpb, 16,
-                                           v_fn_ptr, x->mvcost, bsi->ref_mv);
+            thissme = vp8_full_search_sad(x, c, e, &mvp_full, sadpb, 16,
+                                          v_fn_ptr, x->mvcost, bsi->ref_mv);
 
             if (thissme < bestsme) {
               bestsme = thissme;
diff --git a/libvpx/vp8/encoder/x86/quantize_sse4.c b/libvpx/vp8/encoder/x86/quantize_sse4.c
index 389c167..6d03365 100644
--- a/libvpx/vp8/encoder/x86/quantize_sse4.c
+++ b/libvpx/vp8/encoder/x86/quantize_sse4.c
@@ -11,28 +11,14 @@
 #include <smmintrin.h> /* SSE4.1 */
 
 #include "./vp8_rtcd.h"
-#include "vp8/common/entropy.h" /* vp8_default_inv_zig_zag */
 #include "vp8/encoder/block.h"
-
-#define SELECT_EOB(i, z, x, y, q)                         \
-  do {                                                    \
-    short boost = *zbin_boost_ptr;                        \
-    /* Technically _mm_extract_epi16() returns an int: */ \
-    /* https://bugs.llvm.org/show_bug.cgi?id=41657 */     \
-    short x_z = (short)_mm_extract_epi16(x, z);           \
-    short y_z = (short)_mm_extract_epi16(y, z);           \
-    int cmp = (x_z < boost) | (y_z == 0);                 \
-    zbin_boost_ptr++;                                     \
-    if (cmp) break;                                       \
-    q = _mm_insert_epi16(q, y_z, z);                      \
-    eob = i;                                              \
-    zbin_boost_ptr = b->zrun_zbin_boost;                  \
-  } while (0)
+#include "vpx_ports/bitops.h" /* get_lsb */
 
 void vp8_regular_quantize_b_sse4_1(BLOCK *b, BLOCKD *d) {
-  char eob = 0;
+  int eob = -1;
   short *zbin_boost_ptr = b->zrun_zbin_boost;
-
+  __m128i zbin_boost0 = _mm_load_si128((__m128i *)(zbin_boost_ptr));
+  __m128i zbin_boost1 = _mm_load_si128((__m128i *)(zbin_boost_ptr + 8));
   __m128i x0, x1, y0, y1, x_minus_zbin0, x_minus_zbin1, dqcoeff0, dqcoeff1;
   __m128i quant_shift0 = _mm_load_si128((__m128i *)(b->quant_shift));
   __m128i quant_shift1 = _mm_load_si128((__m128i *)(b->quant_shift + 8));
@@ -47,8 +33,12 @@
   __m128i quant1 = _mm_load_si128((__m128i *)(b->quant + 8));
   __m128i dequant0 = _mm_load_si128((__m128i *)(d->dequant));
   __m128i dequant1 = _mm_load_si128((__m128i *)(d->dequant + 8));
-  __m128i qcoeff0 = _mm_setzero_si128();
-  __m128i qcoeff1 = _mm_setzero_si128();
+  __m128i qcoeff0, qcoeff1, t0, t1, x_shuf0, x_shuf1;
+  uint32_t mask, ymask;
+  DECLARE_ALIGNED(16, static const uint8_t,
+                  zig_zag_mask[16]) = { 0, 1,  4,  8,  5, 2,  3,  6,
+                                        9, 12, 13, 10, 7, 11, 14, 15 };
+  DECLARE_ALIGNED(16, uint16_t, qcoeff[16]) = { 0 };
 
   /* Duplicate to all lanes. */
   zbin_extra = _mm_shufflelo_epi16(zbin_extra, 0);
@@ -88,23 +78,52 @@
   y0 = _mm_sign_epi16(y0, z0);
   y1 = _mm_sign_epi16(y1, z1);
 
-  /* The loop gets unrolled anyway. Avoid the vp8_default_zig_zag1d lookup. */
-  SELECT_EOB(1, 0, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(2, 1, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(3, 4, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(4, 0, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(5, 5, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(6, 2, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(7, 3, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(8, 6, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(9, 1, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(10, 4, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(11, 5, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(12, 2, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(13, 7, x_minus_zbin0, y0, qcoeff0);
-  SELECT_EOB(14, 3, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(15, 6, x_minus_zbin1, y1, qcoeff1);
-  SELECT_EOB(16, 7, x_minus_zbin1, y1, qcoeff1);
+  {
+    const __m128i zig_zag_i16_0 =
+        _mm_setr_epi8(0, 1, 2, 3, 8, 9, 14, 15, 10, 11, 4, 5, 6, 7, 12, 13);
+    const __m128i zig_zag_i16_1 =
+        _mm_setr_epi8(0, 1, 6, 7, 8, 9, 2, 3, 14, 15, 4, 5, 10, 11, 12, 13);
+
+    /* The first part of the zig zag needs a value
+     * from x_minus_zbin1 and vice versa. */
+    t1 = _mm_alignr_epi8(x_minus_zbin1, x_minus_zbin1, 2);
+    t0 = _mm_blend_epi16(x_minus_zbin0, t1, 0x80);
+    t1 = _mm_blend_epi16(t1, x_minus_zbin0, 0x80);
+    x_shuf0 = _mm_shuffle_epi8(t0, zig_zag_i16_0);
+    x_shuf1 = _mm_shuffle_epi8(t1, zig_zag_i16_1);
+  }
+
+  /* Check if y is nonzero and put it in zig zag order. */
+  t0 = _mm_packs_epi16(y0, y1);
+  t0 = _mm_cmpeq_epi8(t0, _mm_setzero_si128());
+  t0 = _mm_shuffle_epi8(t0, _mm_load_si128((const __m128i *)zig_zag_mask));
+  ymask = _mm_movemask_epi8(t0) ^ 0xffff;
+
+  for (;;) {
+    t0 = _mm_cmpgt_epi16(zbin_boost0, x_shuf0);
+    t1 = _mm_cmpgt_epi16(zbin_boost1, x_shuf1);
+    t0 = _mm_packs_epi16(t0, t1);
+    mask = _mm_movemask_epi8(t0);
+    mask = ~mask & ymask;
+    if (!mask) break;
+    /* |eob| will contain the index of the next found element where:
+     * boost[i - old_eob - 1] <= x[zigzag[i]] && y[zigzag[i]] != 0 */
+    eob = get_lsb(mask);
+    /* Need to clear the mask from processed elements so that
+     * they are no longer counted in the next iteration. */
+    ymask &= ~1U << eob;
+    /* It's safe to read ahead of this buffer if struct VP8_COMP has at
+     * least 32 bytes before the zrun_zbin_boost_* fields (it has 384).
+     * Any data read outside of the buffer is masked by the updated |ymask|. */
+    zbin_boost0 = _mm_loadu_si128((__m128i *)(zbin_boost_ptr - eob - 1));
+    zbin_boost1 = _mm_loadu_si128((__m128i *)(zbin_boost_ptr - eob + 7));
+    qcoeff[zig_zag_mask[eob]] = 0xffff;
+  }
+
+  qcoeff0 = _mm_load_si128((__m128i *)(qcoeff));
+  qcoeff1 = _mm_load_si128((__m128i *)(qcoeff + 8));
+  qcoeff0 = _mm_and_si128(qcoeff0, y0);
+  qcoeff1 = _mm_and_si128(qcoeff1, y1);
 
   _mm_store_si128((__m128i *)(d->qcoeff), qcoeff0);
   _mm_store_si128((__m128i *)(d->qcoeff + 8), qcoeff1);
@@ -115,5 +134,5 @@
   _mm_store_si128((__m128i *)(d->dqcoeff), dqcoeff0);
   _mm_store_si128((__m128i *)(d->dqcoeff + 8), dqcoeff1);
 
-  *d->eob = eob;
+  *d->eob = eob + 1;
 }
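
The rewritten SSE4.1 quantizer replaces the sixteen unrolled SELECT_EOB steps
with a bitmask loop: ymask marks zig-zag positions whose quantized value is
nonzero, get_lsb() picks the lowest surviving position that also clears the
zero-run-dependent zbin boost, and the boost vectors are reloaded at an offset
so the table restarts after every kept coefficient. A scalar sketch of the
decision rule being vectorized (find_eob_scalar is a hypothetical helper, not
in the patch):

    static const int kZigZag[16] = { 0, 1,  4,  8,  5, 2,  3,  6,
                                     9, 12, 13, 10, 7, 11, 14, 15 };

    int find_eob_scalar(const short *x_minus_zbin, const short *y,
                        const short *zrun_zbin_boost) {
      int eob = -1;     /* zig-zag index of the last kept coefficient */
      int zero_run = 0; /* positions skipped since the last kept one */
      int i;
      for (i = 0; i < 16; ++i) {
        const int rc = kZigZag[i];
        /* Keep a coefficient only if it clears the zero-run-dependent
         * zbin boost and quantizes to a nonzero value. */
        if (x_minus_zbin[rc] >= zrun_zbin_boost[zero_run] && y[rc] != 0) {
          eob = i;
          zero_run = 0; /* the boost table restarts after each keep */
        } else {
          ++zero_run;
        }
      }
      return eob + 1; /* matches *d->eob = eob + 1 above */
    }
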
diff --git a/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c b/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c
index 147c30c..f6df146 100644
--- a/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c
+++ b/libvpx/vp8/encoder/x86/vp8_quantize_ssse3.c
@@ -12,31 +12,7 @@
 
 #include "./vp8_rtcd.h"
 #include "vp8/encoder/block.h"
-
-/* bitscan reverse (bsr) */
-#if defined(_MSC_VER)
-#include <intrin.h>
-#pragma intrinsic(_BitScanReverse)
-static int bsr(int mask) {
-  unsigned long eob;
-  _BitScanReverse(&eob, mask);
-  eob++;
-  if (mask == 0) eob = 0;
-  return eob;
-}
-#else
-static int bsr(int mask) {
-  int eob;
-#if defined(__GNUC__) && __GNUC__
-  __asm__ __volatile__("bsr %1, %0" : "=r"(eob) : "r"(mask) : "flags");
-#elif defined(__SUNPRO_C) || defined(__SUNPRO_CC)
-  asm volatile("bsr %1, %0" : "=r"(eob) : "r"(mask) : "flags");
-#endif
-  eob++;
-  if (mask == 0) eob = 0;
-  return eob;
-}
-#endif
+#include "vpx_ports/bitops.h" /* get_msb */
 
 void vp8_fast_quantize_b_ssse3(BLOCK *b, BLOCKD *d) {
   int eob, mask;
@@ -108,7 +84,10 @@
 
   mask = _mm_movemask_epi8(x);
 
-  eob = bsr(mask);
+  /* The *2 raises the result for any non-zero mask by 1, and the +1
+   * avoids undefined behavior when the mask is 0: get_msb(0) is
+   * undefined, whereas get_msb(1) correctly yields an eob of 0. */
+  eob = get_msb(mask * 2 + 1);
 
-  *d->eob = 0xFF & eob;
+  *d->eob = eob;
 }
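
The SSSE3 fast quantizer above swaps the inline-asm bsr() helper for the
portable get_msb() and encodes bsr's mask == 0 special case arithmetically.
A sketch checking the identity, using GCC/Clang's __builtin_clz as a
stand-in for the get_msb in vpx_ports/bitops.h:

    #include <assert.h>

    static int get_msb_ref(unsigned int n) { /* n must be nonzero */
      return 31 - __builtin_clz(n);
    }

    int main(void) {
      unsigned int mask;
      for (mask = 0; mask <= 0xffff; ++mask) {
        const int eob = get_msb_ref(mask * 2 + 1);
        if (mask == 0)
          assert(eob == 0); /* bsr(0) was special-cased to 0 */
        else
          assert(eob == get_msb_ref(mask) + 1); /* old bsr() result */
      }
      return 0;
    }
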
diff --git a/libvpx/vp8/vp8_common.mk b/libvpx/vp8/vp8_common.mk
index 286a93a..d485965 100644
--- a/libvpx/vp8/vp8_common.mk
+++ b/libvpx/vp8/vp8_common.mk
@@ -124,6 +124,11 @@
 VP8_COMMON_SRCS-$(HAVE_MSA) += common/mips/msa/mfqe_msa.c
 endif
 
+# common (loongarch LSX intrinsics)
+VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/loopfilter_filters_lsx.c
+VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/sixtap_filter_lsx.c
+VP8_COMMON_SRCS-$(HAVE_LSX) += common/loongarch/idct_lsx.c
+
 # common (neon intrinsics)
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/loopfilter_arm.c
 VP8_COMMON_SRCS-$(HAVE_NEON)  += common/arm/loopfilter_arm.h
diff --git a/libvpx/vp8/vp8_cx_iface.c b/libvpx/vp8/vp8_cx_iface.c
index b5865ce..340f3e6 100644
--- a/libvpx/vp8/vp8_cx_iface.c
+++ b/libvpx/vp8/vp8_cx_iface.c
@@ -18,7 +18,6 @@
 #include "vpx_mem/vpx_mem.h"
 #include "vpx_ports/static_assert.h"
 #include "vpx_ports/system_state.h"
-#include "vpx_ports/vpx_once.h"
 #include "vpx_util/vpx_timestamp.h"
 #include "vp8/encoder/onyx_int.h"
 #include "vpx/vp8cx.h"
@@ -474,14 +473,23 @@
     ERROR("Cannot increase lag_in_frames");
 
   res = validate_config(ctx, cfg, &ctx->vp8_cfg, 0);
+  if (res != VPX_CODEC_OK) return res;
 
-  if (!res) {
-    ctx->cfg = *cfg;
-    set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL);
-    vp8_change_config(ctx->cpi, &ctx->oxcf);
+  if (setjmp(ctx->cpi->common.error.jmp)) {
+    const vpx_codec_err_t codec_err =
+        update_error_state(ctx, &ctx->cpi->common.error);
+    ctx->cpi->common.error.setjmp = 0;
+    vpx_clear_system_state();
+    assert(codec_err != VPX_CODEC_OK);
+    return codec_err;
   }
 
-  return res;
+  ctx->cpi->common.error.setjmp = 1;
+  ctx->cfg = *cfg;
+  set_vp8e_config(&ctx->oxcf, ctx->cfg, ctx->vp8_cfg, NULL);
+  vp8_change_config(ctx->cpi, &ctx->oxcf);
+  ctx->cpi->common.error.setjmp = 0;
+  return VPX_CODEC_OK;
 }
 
 static vpx_codec_err_t get_quantizer(vpx_codec_alg_priv_t *ctx, va_list args) {
@@ -607,6 +615,17 @@
   return update_extracfg(ctx, &extra_cfg);
 }
 
+static vpx_codec_err_t ctrl_set_rtc_external_ratectrl(vpx_codec_alg_priv_t *ctx,
+                                                      va_list args) {
+  VP8_COMP *cpi = ctx->cpi;
+  const unsigned int data = CAST(VP8E_SET_GF_CBR_BOOST_PCT, args);
+  if (data) {
+    cpi->cyclic_refresh_mode_enabled = 0;
+    cpi->rt_always_update_correction_factor = 1;
+  }
+  return VPX_CODEC_OK;
+}
+
 static vpx_codec_err_t vp8e_mr_alloc_mem(const vpx_codec_enc_cfg_t *cfg,
                                          void **mem_loc) {
   vpx_codec_err_t res = VPX_CODEC_OK;
@@ -683,7 +702,7 @@
       ctx->priv->enc.total_encoders = 1;
     }
 
-    once(vp8_initialize_enc);
+    vp8_initialize_enc();
 
     res = validate_config(priv, &priv->cfg, &priv->vp8_cfg, 0);
 
@@ -1245,6 +1264,7 @@
   { VP8E_SET_MAX_INTRA_BITRATE_PCT, set_rc_max_intra_bitrate_pct },
   { VP8E_SET_SCREEN_CONTENT_MODE, set_screen_content_mode },
   { VP8E_SET_GF_CBR_BOOST_PCT, ctrl_set_rc_gf_cbr_boost_pct },
+  { VP8E_SET_RTC_EXTERNAL_RATECTRL, ctrl_set_rtc_external_ratectrl },
   { -1, NULL },
 };
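
vp8e_set_config() above now arms the same setjmp error path the encode loop
uses, so a vpx_internal_error() raised inside vp8_change_config() longjmps
back to the API boundary and is surfaced as a codec error instead of
terminating the process. A minimal sketch of that trampoline shape, with
illustrative names and no libvpx types:

    #include <setjmp.h>
    #include <stdio.h>

    static jmp_buf err_jmp;

    static void deep_callee(int fail) {
      if (fail) longjmp(err_jmp, 1); /* like vpx_internal_error() */
    }

    static int api_entry(int fail) {
      if (setjmp(err_jmp)) return -1; /* like returning codec_err */
      deep_callee(fail);
      return 0; /* like VPX_CODEC_OK */
    }

    int main(void) {
      printf("%d %d\n", api_entry(0), api_entry(1)); /* prints "0 -1" */
      return 0;
    }
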
 
diff --git a/libvpx/vp8/vp8_dx_iface.c b/libvpx/vp8/vp8_dx_iface.c
index ba0714a..6d88e51 100644
--- a/libvpx/vp8/vp8_dx_iface.c
+++ b/libvpx/vp8/vp8_dx_iface.c
@@ -371,8 +371,6 @@
       pc->Width = ctx->si.w;
       pc->Height = ctx->si.h;
       {
-        int prev_mb_rows = pc->mb_rows;
-
         if (setjmp(pbi->common.error.jmp)) {
           pbi->common.error.setjmp = 0;
           /* on failure clear the cached resolution to ensure a full
@@ -398,6 +396,12 @@
                              "Invalid frame height");
         }
 
+#if CONFIG_MULTITHREAD
+        if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) {
+          vp8mt_de_alloc_temp_buffers(pbi, pc->mb_rows);
+        }
+#endif
+
         if (vp8_alloc_frame_buffers(pc, pc->Width, pc->Height)) {
           vpx_internal_error(&pc->error, VPX_CODEC_MEM_ERROR,
                              "Failed to allocate frame buffers");
@@ -442,10 +446,8 @@
 
 #if CONFIG_MULTITHREAD
         if (vpx_atomic_load_acquire(&pbi->b_multithreaded_rd)) {
-          vp8mt_alloc_temp_buffers(pbi, pc->Width, prev_mb_rows);
+          vp8mt_alloc_temp_buffers(pbi, pc->Width, 0);
         }
-#else
-        (void)prev_mb_rows;
 #endif
       }
 
diff --git a/libvpx/vp8/vp8_ratectrl_rtc.cc b/libvpx/vp8/vp8_ratectrl_rtc.cc
new file mode 100644
index 0000000..2f23c5b
--- /dev/null
+++ b/libvpx/vp8/vp8_ratectrl_rtc.cc
@@ -0,0 +1,347 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <math.h>
+#include <new>
+#include "vp8/vp8_ratectrl_rtc.h"
+#include "vp8/encoder/ratectrl.h"
+#include "vpx_ports/system_state.h"
+
+namespace libvpx {
+/* Quant MOD */
+static const int kQTrans[] = {
+  0,  1,  2,  3,  4,  5,  7,   8,   9,   10,  12,  13,  15,  17,  18,  19,
+  20, 21, 23, 24, 25, 26, 27,  28,  29,  30,  31,  33,  35,  37,  39,  41,
+  43, 45, 47, 49, 51, 53, 55,  57,  59,  61,  64,  67,  70,  73,  76,  79,
+  82, 85, 88, 91, 94, 97, 100, 103, 106, 109, 112, 115, 118, 121, 124, 127,
+};
+
+static const unsigned char kf_high_motion_minq[QINDEX_RANGE] = {
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  1,  1,
+  1,  1,  2,  2,  2,  2,  3,  3,  3,  3,  3,  3,  3,  3,  4,  4,  4,  4,  5,
+  5,  5,  5,  5,  5,  6,  6,  6,  6,  7,  7,  8,  8,  8,  8,  9,  9,  10, 10,
+  10, 10, 11, 11, 11, 11, 12, 12, 13, 13, 13, 13, 14, 14, 15, 15, 15, 15, 16,
+  16, 16, 16, 17, 17, 18, 18, 18, 18, 19, 19, 20, 20, 20, 20, 21, 21, 21, 21,
+  22, 22, 23, 23, 24, 25, 25, 26, 26, 27, 28, 28, 29, 30
+};
+
+static const unsigned char inter_minq[QINDEX_RANGE] = {
+  0,  0,  1,  1,  2,  3,  3,  4,  4,  5,  6,  6,  7,  8,  8,  9,  9,  10, 11,
+  11, 12, 13, 13, 14, 15, 15, 16, 17, 17, 18, 19, 20, 20, 21, 22, 22, 23, 24,
+  24, 25, 26, 27, 27, 28, 29, 30, 30, 31, 32, 33, 33, 34, 35, 36, 36, 37, 38,
+  39, 39, 40, 41, 42, 42, 43, 44, 45, 46, 46, 47, 48, 49, 50, 50, 51, 52, 53,
+  54, 55, 55, 56, 57, 58, 59, 60, 60, 61, 62, 63, 64, 65, 66, 67, 67, 68, 69,
+  70, 71, 72, 73, 74, 75, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 86,
+  87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100
+};
+
+static int rescale(int val, int num, int denom) {
+  int64_t llnum = num;
+  int64_t llden = denom;
+  int64_t llval = val;
+
+  return (int)(llval * llnum / llden);
+}
+
+std::unique_ptr<VP8RateControlRTC> VP8RateControlRTC::Create(
+    const VP8RateControlRtcConfig &cfg) {
+  std::unique_ptr<VP8RateControlRTC> rc_api(new (std::nothrow)
+                                                VP8RateControlRTC());
+  if (!rc_api) return nullptr;
+  rc_api->cpi_ = static_cast<VP8_COMP *>(vpx_memalign(32, sizeof(*cpi_)));
+  if (!rc_api->cpi_) return nullptr;
+  vp8_zero(*rc_api->cpi_);
+
+  rc_api->InitRateControl(cfg);
+
+  return rc_api;
+}
+
+void VP8RateControlRTC::InitRateControl(const VP8RateControlRtcConfig &rc_cfg) {
+  VP8_COMMON *cm = &cpi_->common;
+  VP8_CONFIG *oxcf = &cpi_->oxcf;
+  oxcf->end_usage = USAGE_STREAM_FROM_SERVER;
+  cpi_->pass = 0;
+  cm->show_frame = 1;
+  oxcf->drop_frames_water_mark = 0;
+  cm->current_video_frame = 0;
+  cpi_->auto_gold = 1;
+  cpi_->key_frame_count = 1;
+  cpi_->rate_correction_factor = 1.0;
+  cpi_->key_frame_rate_correction_factor = 1.0;
+  cpi_->cyclic_refresh_mode_enabled = 0;
+  cpi_->auto_worst_q = 1;
+  cpi_->kf_overspend_bits = 0;
+  cpi_->kf_bitrate_adjustment = 0;
+  cpi_->gf_overspend_bits = 0;
+  cpi_->non_gf_bitrate_adjustment = 0;
+  UpdateRateControl(rc_cfg);
+  cpi_->buffer_level = oxcf->starting_buffer_level;
+  cpi_->bits_off_target = oxcf->starting_buffer_level;
+}
+
+void VP8RateControlRTC::UpdateRateControl(
+    const VP8RateControlRtcConfig &rc_cfg) {
+  VP8_COMMON *cm = &cpi_->common;
+  VP8_CONFIG *oxcf = &cpi_->oxcf;
+  vpx_clear_system_state();
+  cm->Width = rc_cfg.width;
+  cm->Height = rc_cfg.height;
+  oxcf->Width = rc_cfg.width;
+  oxcf->Height = rc_cfg.height;
+  oxcf->worst_allowed_q = kQTrans[rc_cfg.max_quantizer];
+  oxcf->best_allowed_q = kQTrans[rc_cfg.min_quantizer];
+  cpi_->worst_quality = oxcf->worst_allowed_q;
+  cpi_->best_quality = oxcf->best_allowed_q;
+  cpi_->output_framerate = rc_cfg.framerate;
+  oxcf->target_bandwidth =
+      static_cast<unsigned int>(1000 * rc_cfg.target_bandwidth);
+  cpi_->ref_framerate = cpi_->output_framerate;
+  oxcf->fixed_q = -1;
+  oxcf->error_resilient_mode = 1;
+  oxcf->starting_buffer_level_in_ms = rc_cfg.buf_initial_sz;
+  oxcf->optimal_buffer_level_in_ms = rc_cfg.buf_optimal_sz;
+  oxcf->maximum_buffer_size_in_ms = rc_cfg.buf_sz;
+  oxcf->starting_buffer_level = rc_cfg.buf_initial_sz;
+  oxcf->optimal_buffer_level = rc_cfg.buf_optimal_sz;
+  oxcf->maximum_buffer_size = rc_cfg.buf_sz;
+  oxcf->number_of_layers = rc_cfg.ts_number_layers;
+  cpi_->buffered_mode = oxcf->optimal_buffer_level > 0;
+  oxcf->under_shoot_pct = rc_cfg.undershoot_pct;
+  oxcf->over_shoot_pct = rc_cfg.overshoot_pct;
+  cpi_->oxcf.rc_max_intra_bitrate_pct = rc_cfg.max_intra_bitrate_pct;
+  cpi_->framerate = rc_cfg.framerate;
+  for (int i = 0; i < KEY_FRAME_CONTEXT; ++i) {
+    cpi_->prior_key_frame_distance[i] =
+        static_cast<int>(cpi_->output_framerate);
+  }
+
+  if (oxcf->number_of_layers > 1) {
+    memcpy(oxcf->target_bitrate, rc_cfg.layer_target_bitrate,
+           sizeof(rc_cfg.layer_target_bitrate));
+    memcpy(oxcf->rate_decimator, rc_cfg.ts_rate_decimator,
+           sizeof(rc_cfg.ts_rate_decimator));
+    oxcf->periodicity = 2;
+
+    double prev_layer_framerate = 0;
+    for (unsigned int i = 0; i < oxcf->number_of_layers; ++i) {
+      vp8_init_temporal_layer_context(cpi_, oxcf, i, prev_layer_framerate);
+      prev_layer_framerate = cpi_->output_framerate / oxcf->rate_decimator[i];
+    }
+  }
+
+  cpi_->total_actual_bits = 0;
+  cpi_->total_target_vs_actual = 0;
+
+  cm->mb_rows = cm->Height >> 4;
+  cm->mb_cols = cm->Width >> 4;
+  cm->MBs = cm->mb_rows * cm->mb_cols;
+  cm->mode_info_stride = cm->mb_cols + 1;
+
+  oxcf->starting_buffer_level =
+      rescale((int)oxcf->starting_buffer_level, oxcf->target_bandwidth, 1000);
+  /* Set or reset optimal and maximum buffer levels. */
+  if (oxcf->optimal_buffer_level == 0) {
+    oxcf->optimal_buffer_level = oxcf->target_bandwidth / 8;
+  } else {
+    oxcf->optimal_buffer_level =
+        rescale((int)oxcf->optimal_buffer_level, oxcf->target_bandwidth, 1000);
+  }
+  if (oxcf->maximum_buffer_size == 0) {
+    oxcf->maximum_buffer_size = oxcf->target_bandwidth / 8;
+  } else {
+    oxcf->maximum_buffer_size =
+        rescale((int)oxcf->maximum_buffer_size, oxcf->target_bandwidth, 1000);
+  }
+
+  if (cpi_->bits_off_target > oxcf->maximum_buffer_size) {
+    cpi_->bits_off_target = oxcf->maximum_buffer_size;
+    cpi_->buffer_level = cpi_->bits_off_target;
+  }
+
+  vp8_new_framerate(cpi_, cpi_->framerate);
+  vpx_clear_system_state();
+}
+
+void VP8RateControlRTC::ComputeQP(const VP8FrameParamsQpRTC &frame_params) {
+  VP8_COMMON *const cm = &cpi_->common;
+  vpx_clear_system_state();
+  if (cpi_->oxcf.number_of_layers > 1) {
+    cpi_->temporal_layer_id = frame_params.temporal_layer_id;
+    const int layer = frame_params.temporal_layer_id;
+    vp8_update_layer_contexts(cpi_);
+    /* Restore layer specific context & set frame rate */
+    vp8_restore_layer_context(cpi_, layer);
+    vp8_new_framerate(cpi_, cpi_->layer_context[layer].framerate);
+  }
+  cm->frame_type = frame_params.frame_type;
+  cm->refresh_golden_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
+  cm->refresh_alt_ref_frame = (cm->frame_type == KEY_FRAME) ? 1 : 0;
+  if (cm->frame_type == KEY_FRAME && cpi_->common.current_video_frame > 0) {
+    cpi_->common.frame_flags |= FRAMEFLAGS_KEY;
+  }
+
+  vp8_pick_frame_size(cpi_);
+
+  if (cpi_->buffer_level >= cpi_->oxcf.optimal_buffer_level &&
+      cpi_->buffered_mode) {
+    /* Max adjustment is 1/4 */
+    int Adjustment = cpi_->active_worst_quality / 4;
+    if (Adjustment) {
+      int buff_lvl_step;
+      if (cpi_->buffer_level < cpi_->oxcf.maximum_buffer_size) {
+        buff_lvl_step = (int)((cpi_->oxcf.maximum_buffer_size -
+                               cpi_->oxcf.optimal_buffer_level) /
+                              Adjustment);
+        if (buff_lvl_step) {
+          Adjustment =
+              (int)((cpi_->buffer_level - cpi_->oxcf.optimal_buffer_level) /
+                    buff_lvl_step);
+        } else {
+          Adjustment = 0;
+        }
+      }
+      cpi_->active_worst_quality -= Adjustment;
+      if (cpi_->active_worst_quality < cpi_->active_best_quality) {
+        cpi_->active_worst_quality = cpi_->active_best_quality;
+      }
+    }
+  }
+
+  if (cpi_->ni_frames > 150) {
+    int q = cpi_->active_worst_quality;
+    if (cm->frame_type == KEY_FRAME) {
+      cpi_->active_best_quality = kf_high_motion_minq[q];
+    } else {
+      cpi_->active_best_quality = inter_minq[q];
+    }
+
+    if (cpi_->buffer_level >= cpi_->oxcf.maximum_buffer_size) {
+      cpi_->active_best_quality = cpi_->best_quality;
+
+    } else if (cpi_->buffer_level > cpi_->oxcf.optimal_buffer_level) {
+      int Fraction =
+          (int)(((cpi_->buffer_level - cpi_->oxcf.optimal_buffer_level) * 128) /
+                (cpi_->oxcf.maximum_buffer_size -
+                 cpi_->oxcf.optimal_buffer_level));
+      int min_qadjustment =
+          ((cpi_->active_best_quality - cpi_->best_quality) * Fraction) / 128;
+
+      cpi_->active_best_quality -= min_qadjustment;
+    }
+  }
+
+  /* Clip the active best and worst quality values to limits */
+  if (cpi_->active_worst_quality > cpi_->worst_quality) {
+    cpi_->active_worst_quality = cpi_->worst_quality;
+  }
+  if (cpi_->active_best_quality < cpi_->best_quality) {
+    cpi_->active_best_quality = cpi_->best_quality;
+  }
+  if (cpi_->active_worst_quality < cpi_->active_best_quality) {
+    cpi_->active_worst_quality = cpi_->active_best_quality;
+  }
+
+  q_ = vp8_regulate_q(cpi_, cpi_->this_frame_target);
+  vp8_set_quantizer(cpi_, q_);
+  vpx_clear_system_state();
+}
+
+int VP8RateControlRTC::GetQP() const { return q_; }
+
+void VP8RateControlRTC::PostEncodeUpdate(uint64_t encoded_frame_size) {
+  VP8_COMMON *const cm = &cpi_->common;
+  vpx_clear_system_state();
+  cpi_->total_byte_count += encoded_frame_size;
+  cpi_->projected_frame_size = static_cast<int>(encoded_frame_size << 3);
+  if (cpi_->oxcf.number_of_layers > 1) {
+    for (unsigned int i = cpi_->current_layer + 1;
+         i < cpi_->oxcf.number_of_layers; ++i) {
+      cpi_->layer_context[i].total_byte_count += encoded_frame_size;
+    }
+  }
+
+  vp8_update_rate_correction_factors(cpi_, 2);
+
+  cpi_->last_q[cm->frame_type] = cm->base_qindex;
+
+  if (cm->frame_type == KEY_FRAME) {
+    vp8_adjust_key_frame_context(cpi_);
+  }
+
+  /* Keep a record of ambient average Q. */
+  if (cm->frame_type != KEY_FRAME) {
+    cpi_->avg_frame_qindex =
+        (2 + 3 * cpi_->avg_frame_qindex + cm->base_qindex) >> 2;
+  }
+  /* Keep a record from which we can calculate the average Q excluding
+   * key frames.
+   */
+  if (cm->frame_type != KEY_FRAME) {
+    cpi_->ni_frames++;
+    /* Damp value for first few frames */
+    if (cpi_->ni_frames > 150) {
+      cpi_->ni_tot_qi += q_;
+      cpi_->ni_av_qi = (cpi_->ni_tot_qi / cpi_->ni_frames);
+    } else {
+      cpi_->ni_tot_qi += q_;
+      cpi_->ni_av_qi =
+          ((cpi_->ni_tot_qi / cpi_->ni_frames) + cpi_->worst_quality + 1) / 2;
+    }
+
+    /* If the average Q is higher than what was used in the last
+     * frame (after going through the recode loop to keep the frame
+     * size within range) then use the last frame value - 1. The -1
+     * is designed to stop Q, and hence the data rate, from
+     * progressively falling away during difficult sections, but at
+     * the same time reduce the number of iterations around the
+     * recode loop.
+     */
+    if (q_ > cpi_->ni_av_qi) cpi_->ni_av_qi = q_ - 1;
+  }
+
+  cpi_->bits_off_target +=
+      cpi_->av_per_frame_bandwidth - cpi_->projected_frame_size;
+  if (cpi_->bits_off_target > cpi_->oxcf.maximum_buffer_size) {
+    cpi_->bits_off_target = cpi_->oxcf.maximum_buffer_size;
+  }
+
+  cpi_->total_actual_bits += cpi_->projected_frame_size;
+  cpi_->buffer_level = cpi_->bits_off_target;
+
+  /* Propagate values to higher temporal layers */
+  if (cpi_->oxcf.number_of_layers > 1) {
+    for (unsigned int i = cpi_->current_layer + 1;
+         i < cpi_->oxcf.number_of_layers; ++i) {
+      LAYER_CONTEXT *lc = &cpi_->layer_context[i];
+      int bits_off_for_this_layer = (int)round(
+          lc->target_bandwidth / lc->framerate - cpi_->projected_frame_size);
+
+      lc->bits_off_target += bits_off_for_this_layer;
+
+      /* Clip buffer level to maximum buffer size for the layer */
+      if (lc->bits_off_target > lc->maximum_buffer_size) {
+        lc->bits_off_target = lc->maximum_buffer_size;
+      }
+
+      lc->total_actual_bits += cpi_->projected_frame_size;
+      lc->total_target_vs_actual += bits_off_for_this_layer;
+      lc->buffer_level = lc->bits_off_target;
+    }
+  }
+
+  cpi_->common.current_video_frame++;
+  cpi_->frames_since_key++;
+
+  if (cpi_->oxcf.number_of_layers > 1) vp8_save_layer_context(cpi_);
+  vpx_clear_system_state();
+}
+}  // namespace libvpx
diff --git a/libvpx/vp8/vp8_ratectrl_rtc.h b/libvpx/vp8/vp8_ratectrl_rtc.h
new file mode 100644
index 0000000..def7dd8
--- /dev/null
+++ b/libvpx/vp8/vp8_ratectrl_rtc.h
@@ -0,0 +1,63 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VP8_RATECTRL_RTC_H_
+#define VPX_VP8_RATECTRL_RTC_H_
+
+#include <cstdint>
+#include <memory>
+
+#include "vp8/encoder/onyx_int.h"
+#include "vp8/common/common.h"
+#include "vpx/internal/vpx_ratectrl_rtc.h"
+
+namespace libvpx {
+struct VP8RateControlRtcConfig : public VpxRateControlRtcConfig {
+ public:
+  VP8RateControlRtcConfig() {
+    vp8_zero(layer_target_bitrate);
+    vp8_zero(ts_rate_decimator);
+  }
+};
+
+struct VP8FrameParamsQpRTC {
+  FRAME_TYPE frame_type;
+  int temporal_layer_id;
+};
+
+class VP8RateControlRTC {
+ public:
+  static std::unique_ptr<VP8RateControlRTC> Create(
+      const VP8RateControlRtcConfig &cfg);
+  ~VP8RateControlRTC() {
+    if (cpi_) {
+      vpx_free(cpi_->gf_active_flags);
+      vpx_free(cpi_);
+    }
+  }
+
+  void UpdateRateControl(const VP8RateControlRtcConfig &rc_cfg);
+  // GetQP() needs to be called after ComputeQP() to get the latest QP
+  int GetQP() const;
+  // int GetLoopfilterLevel() const;
+  void ComputeQP(const VP8FrameParamsQpRTC &frame_params);
+  // Feedback to rate control with the size of current encoded frame
+  void PostEncodeUpdate(uint64_t encoded_frame_size);
+
+ private:
+  VP8RateControlRTC() {}
+  void InitRateControl(const VP8RateControlRtcConfig &cfg);
+  VP8_COMP *cpi_;
+  int q_;
+};
+
+}  // namespace libvpx
+
+#endif  // VPX_VP8_RATECTRL_RTC_H_
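
A hedged usage sketch of the new external rate-control API declared above;
rc_example is hypothetical, the encoder wiring is omitted, and only fields
actually referenced by vp8_ratectrl_rtc.cc are set:

    #include <memory>
    #include "vp8/vp8_ratectrl_rtc.h"

    int rc_example() {
      libvpx::VP8RateControlRtcConfig cfg;
      cfg.width = 640;
      cfg.height = 480;
      cfg.framerate = 30;
      cfg.target_bandwidth = 800;  // units follow VpxRateControlRtcConfig
      std::unique_ptr<libvpx::VP8RateControlRTC> rc =
          libvpx::VP8RateControlRTC::Create(cfg);
      if (!rc) return -1;

      libvpx::VP8FrameParamsQpRTC frame_params;
      frame_params.frame_type = KEY_FRAME;
      frame_params.temporal_layer_id = 0;
      rc->ComputeQP(frame_params);  // decide QP for this frame
      const int qp = rc->GetQP();   // valid only after ComputeQP()
      (void)qp;                     // the caller encodes the frame at |qp|
      rc->PostEncodeUpdate(1000);   // feed back the encoded size
      return 0;
    }
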
diff --git a/libvpx/vp8/vp8cx.mk b/libvpx/vp8/vp8cx.mk
index 3a8f8ea..5744cba 100644
--- a/libvpx/vp8/vp8cx.mk
+++ b/libvpx/vp8/vp8cx.mk
@@ -124,4 +124,9 @@
 VP8_CX_SRCS_REMOVE-$(HAVE_MSA) += encoder/mips/msa/temporal_filter_msa.c
 endif
 
+# common (loongarch LSX intrinsics)
+VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/quantize_lsx.c
+VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/dct_lsx.c
+VP8_CX_SRCS-$(HAVE_LSX) += encoder/loongarch/encodeopt_lsx.c
+
 VP8_CX_SRCS-yes := $(filter-out $(VP8_CX_SRCS_REMOVE-yes),$(VP8_CX_SRCS-yes))
diff --git a/libvpx/vp9/common/vp9_alloccommon.c b/libvpx/vp9/common/vp9_alloccommon.c
index 5702dca..faad657 100644
--- a/libvpx/vp9/common/vp9_alloccommon.c
+++ b/libvpx/vp9/common/vp9_alloccommon.c
@@ -73,6 +73,8 @@
 void vp9_free_ref_frame_buffers(BufferPool *pool) {
   int i;
 
+  if (!pool) return;
+
   for (i = 0; i < FRAME_BUFFERS; ++i) {
     if (!pool->frame_bufs[i].released &&
         pool->frame_bufs[i].raw_frame_buffer.data != NULL) {
@@ -100,7 +102,7 @@
 }
 
 void vp9_free_context_buffers(VP9_COMMON *cm) {
-  cm->free_mi(cm);
+  if (cm->free_mi) cm->free_mi(cm);
   free_seg_map(cm);
   vpx_free(cm->above_context);
   cm->above_context = NULL;
diff --git a/libvpx/vp9/common/vp9_common.h b/libvpx/vp9/common/vp9_common.h
index e3c5535..3cec53b 100644
--- a/libvpx/vp9/common/vp9_common.h
+++ b/libvpx/vp9/common/vp9_common.h
@@ -27,10 +27,10 @@
 
 // Only need this for fixed-size arrays, for structs just assign.
 #define vp9_copy(dest, src)              \
-  {                                      \
+  do {                                   \
     assert(sizeof(dest) == sizeof(src)); \
     memcpy(dest, src, sizeof(src));      \
-  }
+  } while (0)
 
 // Use this for variably-sized arrays.
 #define vp9_copy_array(dest, src, n)           \
diff --git a/libvpx/vp9/common/vp9_frame_buffers.c b/libvpx/vp9/common/vp9_frame_buffers.c
index a254e79..889b809 100644
--- a/libvpx/vp9/common/vp9_frame_buffers.c
+++ b/libvpx/vp9/common/vp9_frame_buffers.c
@@ -14,14 +14,17 @@
 #include "vpx_mem/vpx_mem.h"
 
 int vp9_alloc_internal_frame_buffers(InternalFrameBufferList *list) {
+  const int num_buffers = VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
   assert(list != NULL);
   vp9_free_internal_frame_buffers(list);
 
-  list->num_internal_frame_buffers =
-      VP9_MAXIMUM_REF_BUFFERS + VPX_MAXIMUM_WORK_BUFFERS;
-  list->int_fb = (InternalFrameBuffer *)vpx_calloc(
-      list->num_internal_frame_buffers, sizeof(*list->int_fb));
-  return (list->int_fb == NULL);
+  list->int_fb =
+      (InternalFrameBuffer *)vpx_calloc(num_buffers, sizeof(*list->int_fb));
+  if (list->int_fb) {
+    list->num_internal_frame_buffers = num_buffers;
+    return 0;
+  }
+  return -1;
 }
 
 void vp9_free_internal_frame_buffers(InternalFrameBufferList *list) {
@@ -35,6 +38,7 @@
   }
   vpx_free(list->int_fb);
   list->int_fb = NULL;
+  list->num_internal_frame_buffers = 0;
 }
 
 int vp9_get_frame_buffer(void *cb_priv, size_t min_size,
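
vp9_alloc_internal_frame_buffers() now publishes num_internal_frame_buffers
only after the calloc succeeds, and the free path resets the count, so a
failed allocation can never leave a nonzero count paired with a NULL int_fb
for a later free to walk. The general shape, with illustrative names:

    #include <stdlib.h>

    struct buf_list { void **items; int num_items; };

    int buf_list_alloc(struct buf_list *list, int n) {
      void **p = calloc(n, sizeof(*p));
      if (!p) return -1;   /* num_items still describes the old state */
      list->items = p;
      list->num_items = n; /* published only on success */
      return 0;
    }
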
diff --git a/libvpx/vp9/common/vp9_postproc.c b/libvpx/vp9/common/vp9_postproc.c
index d2c8535..96519f0 100644
--- a/libvpx/vp9/common/vp9_postproc.c
+++ b/libvpx/vp9/common/vp9_postproc.c
@@ -360,6 +360,7 @@
     if (!cm->postproc_state.limits) {
       cm->postproc_state.limits =
           vpx_calloc(unscaled_width, sizeof(*cm->postproc_state.limits));
+      if (!cm->postproc_state.limits) return 1;
     }
   }
 
diff --git a/libvpx/vp9/common/vp9_rtcd.c b/libvpx/vp9/common/vp9_rtcd.c
index d8c870a..37762ca 100644
--- a/libvpx/vp9/common/vp9_rtcd.c
+++ b/libvpx/vp9/common/vp9_rtcd.c
@@ -12,8 +12,4 @@
 #include "./vp9_rtcd.h"
 #include "vpx_ports/vpx_once.h"
 
-void vp9_rtcd() {
-  // TODO(JBB): Remove this once, by insuring that both the encoder and
-  // decoder setup functions are protected by once();
-  once(setup_rtcd_internal);
-}
+void vp9_rtcd() { once(setup_rtcd_internal); }
diff --git a/libvpx/vp9/common/vp9_rtcd_defs.pl b/libvpx/vp9/common/vp9_rtcd_defs.pl
index 6980b9b..4da0b66 100644
--- a/libvpx/vp9/common/vp9_rtcd_defs.pl
+++ b/libvpx/vp9/common/vp9_rtcd_defs.pl
@@ -128,10 +128,10 @@
 
 add_proto qw/int64_t vp9_block_error_fp/, "const tran_low_t *coeff, const tran_low_t *dqcoeff, int block_size";
 
-add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+add_proto qw/void vp9_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
 specialize qw/vp9_quantize_fp neon sse2 avx2 vsx/, "$ssse3_x86_64";
 
-add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+add_proto qw/void vp9_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
 specialize qw/vp9_quantize_fp_32x32 neon vsx/, "$ssse3_x86_64";
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
@@ -159,9 +159,9 @@
 
 # Note that there are more specializations appended when CONFIG_VP9_HIGHBITDEPTH
 # is off.
-specialize qw/vp9_fht4x4 sse2/;
-specialize qw/vp9_fht8x8 sse2/;
-specialize qw/vp9_fht16x16 sse2/;
+specialize qw/vp9_fht4x4 sse2 neon/;
+specialize qw/vp9_fht8x8 sse2 neon/;
+specialize qw/vp9_fht16x16 sse2 neon/;
 specialize qw/vp9_fwht4x4 sse2/;
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") ne "yes") {
   # Note that these specializations are appended to the above ones.
@@ -195,9 +195,9 @@
 
   # ENCODEMB INVOKE
 
-  add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  add_proto qw/void vp9_highbd_quantize_fp/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
 
-  add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ;
+  add_proto qw/void vp9_highbd_quantize_fp_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan" ;
 
   # fdct functions
   add_proto qw/void vp9_highbd_fht4x4/, "const int16_t *input, tran_low_t *output, int stride, int tx_type";
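With skip_block removed, each vp9_quantize_fp* prototype now takes ten
arguments. A minimal call sketch against the updated signature (every variable
name below is illustrative, not from this patch):

  /* Hypothetical call site -- names are illustrative only. */
  uint16_t eob = 0;
  vp9_quantize_fp(coeff, n_coeffs, round_fp, quant_fp, qcoeff, dqcoeff,
                  dequant, &eob, scan, iscan);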
diff --git a/libvpx/vp9/common/vp9_seg_common.h b/libvpx/vp9/common/vp9_seg_common.h
index b63e4f4..5e71c2f 100644
--- a/libvpx/vp9/common/vp9_seg_common.h
+++ b/libvpx/vp9/common/vp9_seg_common.h
@@ -25,6 +25,11 @@
 
 #define PREDICTION_PROBS 3
 
+// Segment ID used to skip background encoding
+#define BACKGROUND_SEG_SKIP_ID 3
+// Number of frames after a key frame during which no skipping is done
+#define FRAMES_NO_SKIPPING_AFTER_KEY 20
+
 // Segment level features.
 typedef enum {
   SEG_LVL_ALT_Q = 0,      // Use alternate Quantizer ....
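A hedged sketch of how these two constants pair up (the condition and the
frames_since_key name are illustrative; the actual consumer lives in the
encoder, outside this hunk):

  /* Hypothetical use -- illustrative only. */
  if (segment_id == BACKGROUND_SEG_SKIP_ID &&
      frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY) {
    /* background block, far enough past the key frame: eligible to skip */
  }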
diff --git a/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
new file mode 100644
index 0000000..a07a160
--- /dev/null
+++ b/libvpx/vp9/encoder/arm/neon/vp9_dct_neon.c
@@ -0,0 +1,1460 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <arm_neon.h>
+
+#include "./vpx_config.h"
+#include "./vp9_rtcd.h"
+#include "./vpx_dsp_rtcd.h"
+
+#include "vpx_dsp/txfm_common.h"
+#include "vpx_dsp/arm/mem_neon.h"
+#include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
+
+static INLINE void load_buffer_4x4(const int16_t *input, int16x8_t *in,
+                                   int stride) {
+  // { 0, 1, 1, 1, 1, 1, 1, 1 };
+  const int16x8_t nonzero_bias_a = vextq_s16(vdupq_n_s16(0), vdupq_n_s16(1), 7);
+  // { 1, 0, 0, 0, 0, 0, 0, 0 };
+  const int16x8_t nonzero_bias_b = vextq_s16(vdupq_n_s16(1), vdupq_n_s16(0), 7);
+  int16x8_t mask;
+
+  int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
+  int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
+  int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
+  int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
+
+  in[0] = vcombine_s16(input_0, input_1);
+  in[1] = vcombine_s16(input_2, input_3);
+
+  // Copy the SSE method: use a compare mask instead of an 'if' branch to add
+  // one to the first element when it is non-zero. Lane 0 compares against 0
+  // (cancelling the unconditional +1 from nonzero_bias_b when the input is
+  // zero); the remaining lanes compare against 1, which cannot match since
+  // every input was shifted left by 4 above.
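+  // Scalar equivalent of the mask sequence below (illustrative only):
+  //   in0[0] += (in0[0] != 0);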
+  mask = vreinterpretq_s16_u16(vceqq_s16(in[0], nonzero_bias_a));
+  in[0] = vaddq_s16(in[0], mask);
+  in[0] = vaddq_s16(in[0], nonzero_bias_b);
+}
+
+static INLINE void write_buffer_4x4(tran_low_t *output, int16x8_t *res) {
+  const int16x8_t one_s16 = vdupq_n_s16(1);
+  res[0] = vaddq_s16(res[0], one_s16);
+  res[1] = vaddq_s16(res[1], one_s16);
+  res[0] = vshrq_n_s16(res[0], 2);
+  res[1] = vshrq_n_s16(res[1], 2);
+  store_s16q_to_tran_low(output + 0 * 8, res[0]);
+  store_s16q_to_tran_low(output + 1 * 8, res[1]);
+}
+
+static INLINE void fadst4x4_neon(int16x8_t *in) {
+  int32x4_t u0, u1, u2, u3;
+  int16x4_t out_0, out_1, out_2, out_3;
+  const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING);
+
+  const int16x4_t s0 = vget_low_s16(in[0]);   // | x_00 | x_01 | x_02 | x_03 |
+  const int16x4_t s1 = vget_high_s16(in[0]);  // | x_10 | x_11 | x_12 | x_13 |
+  const int16x4_t s2 = vget_low_s16(in[1]);   // | x_20 | x_21 | x_22 | x_23 |
+  const int16x4_t s3 = vget_high_s16(in[1]);  // | x_30 | x_31 | x_32 | x_33 |
+
+  // s0 * sinpi_1_9, s0 * sinpi_4_9
+  // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
+  const int32x4_t s0s1_9 = vmull_n_s16(s0, sinpi_1_9);
+  const int32x4_t s0s4_9 = vmull_n_s16(s0, sinpi_4_9);
+  // s1 * sinpi_1_9, s1 * sinpi_2_9
+  const int32x4_t s1s1_9 = vmull_n_s16(s1, sinpi_1_9);
+  const int32x4_t s1s2_9 = vmull_n_s16(s1, sinpi_2_9);
+  // s2 * sinpi_3_9
+  const int32x4_t s2s3_9 = vmull_n_s16(s2, sinpi_3_9);
+  // s3 * sinpi_2_9, s3 * sinpi_4_9
+  const int32x4_t s3s2_9 = vmull_n_s16(s3, sinpi_2_9);
+  const int32x4_t s3s4_9 = vmull_n_s16(s3, sinpi_4_9);
+
+  // (s0 + s1) * sinpi_3_9
+  const int32x4_t s0_p_s1 = vaddl_s16(s0, s1);
+  const int32x4_t s0_p_s1_m_s3 = vsubw_s16(s0_p_s1, s3);
+
+  // s_0 * sinpi_1_9 + s_1 * sinpi_2_9
+  // s_0 * sinpi_4_9 - s_1 * sinpi_1_9
+  const int32x4_t s0s1_9_p_s1s2_9 = vaddq_s32(s0s1_9, s1s2_9);
+  const int32x4_t s0s4_9_m_s1s1_9 = vsubq_s32(s0s4_9, s1s1_9);
+  /*
+   * t0 = s0s1_9 + s1s2_9 + s3s4_9
+   * t1 = (s0 + s1) * sinpi_3_9 - s3 * sinpi_3_9
+   * t2 = s0s4_9 - s1s1_9 + s3s2_9
+   * t3 = s2s3_9
+   */
+  const int32x4_t t0 = vaddq_s32(s0s1_9_p_s1s2_9, s3s4_9);
+  const int32x4_t t1 = vmulq_n_s32(s0_p_s1_m_s3, sinpi_3_9);
+  const int32x4_t t2 = vaddq_s32(s0s4_9_m_s1s1_9, s3s2_9);
+  const int32x4_t t3 = s2s3_9;
+  /*
+   * u0 = t0 + t3
+   * u1 = t1
+   * u2 = t2 - t3
+   * u3 = t2 - t0 + t3
+   */
+  u0 = vaddq_s32(t0, t3);
+  u1 = t1;
+  u2 = vsubq_s32(t2, t3);
+  u3 = vaddq_s32(vsubq_s32(t2, t0), t3);
+
+  // fdct_round_shift
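+  // fdct_round_shift(x) == ROUND_POWER_OF_TWO(x, DCT_CONST_BITS): add
+  // DCT_CONST_ROUNDING (1 << (DCT_CONST_BITS - 1)), then shift right.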
+  u0 = vaddq_s32(u0, k__DCT_CONST_ROUNDING);
+  u1 = vaddq_s32(u1, k__DCT_CONST_ROUNDING);
+  u2 = vaddq_s32(u2, k__DCT_CONST_ROUNDING);
+  u3 = vaddq_s32(u3, k__DCT_CONST_ROUNDING);
+
+  out_0 = vshrn_n_s32(u0, DCT_CONST_BITS);
+  out_1 = vshrn_n_s32(u1, DCT_CONST_BITS);
+  out_2 = vshrn_n_s32(u2, DCT_CONST_BITS);
+  out_3 = vshrn_n_s32(u3, DCT_CONST_BITS);
+
+  transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
+
+  in[0] = vcombine_s16(out_0, out_1);
+  in[1] = vcombine_s16(out_2, out_3);
+}
+
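+// Hybrid 4x4 forward transform: tx_type selects the 1-D transform applied in
+// each direction (e.g. ADST_DCT runs the ADST pass first, then the DCT pass;
+// each pass ends in a transpose). DCT_DCT is delegated to vpx_fdct4x4_neon().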
+void vp9_fht4x4_neon(const int16_t *input, tran_low_t *output, int stride,
+                     int tx_type) {
+  int16x8_t in[2];
+
+  switch (tx_type) {
+    case DCT_DCT: vpx_fdct4x4_neon(input, output, stride); break;
+    case ADST_DCT:
+      load_buffer_4x4(input, in, stride);
+      fadst4x4_neon(in);
+      vpx_fdct4x4_pass1_neon((int16x4_t *)in);
+      write_buffer_4x4(output, in);
+      break;
+    case DCT_ADST:
+      load_buffer_4x4(input, in, stride);
+      vpx_fdct4x4_pass1_neon((int16x4_t *)in);
+      fadst4x4_neon(in);
+      write_buffer_4x4(output, in);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      load_buffer_4x4(input, in, stride);
+      fadst4x4_neon(in);
+      fadst4x4_neon(in);
+      write_buffer_4x4(output, in);
+      break;
+  }
+}
+
+static INLINE void load_buffer_8x8(const int16_t *input, int16x8_t *in,
+                                   int stride) {
+  in[0] = vshlq_n_s16(vld1q_s16(input + 0 * stride), 2);
+  in[1] = vshlq_n_s16(vld1q_s16(input + 1 * stride), 2);
+  in[2] = vshlq_n_s16(vld1q_s16(input + 2 * stride), 2);
+  in[3] = vshlq_n_s16(vld1q_s16(input + 3 * stride), 2);
+  in[4] = vshlq_n_s16(vld1q_s16(input + 4 * stride), 2);
+  in[5] = vshlq_n_s16(vld1q_s16(input + 5 * stride), 2);
+  in[6] = vshlq_n_s16(vld1q_s16(input + 6 * stride), 2);
+  in[7] = vshlq_n_s16(vld1q_s16(input + 7 * stride), 2);
+}
+
+/* Rounding right shift by `bit` (1 or 2).
+ * The sign bit (bit 15) is extracted first so that negative values are not
+ * rounded toward -infinity by the arithmetic shift. For bit == 2 this
+ * computes, for each element:
+ *
+ * out[j * 16 + i] = (temp_out[j] + 1 + (temp_out[j] < 0)) >> 2;
+ *
+ * and for bit == 1, without the +1 rounding term:
+ *
+ * out[j * 16 + i] = (temp_out[j] + (temp_out[j] < 0)) >> 1;
+ */
+static INLINE void right_shift_8x8(int16x8_t *res, const int bit) {
+  int16x8_t sign0 = vshrq_n_s16(res[0], 15);
+  int16x8_t sign1 = vshrq_n_s16(res[1], 15);
+  int16x8_t sign2 = vshrq_n_s16(res[2], 15);
+  int16x8_t sign3 = vshrq_n_s16(res[3], 15);
+  int16x8_t sign4 = vshrq_n_s16(res[4], 15);
+  int16x8_t sign5 = vshrq_n_s16(res[5], 15);
+  int16x8_t sign6 = vshrq_n_s16(res[6], 15);
+  int16x8_t sign7 = vshrq_n_s16(res[7], 15);
+
+  if (bit == 2) {
+    const int16x8_t const_rounding = vdupq_n_s16(1);
+    res[0] = vaddq_s16(res[0], const_rounding);
+    res[1] = vaddq_s16(res[1], const_rounding);
+    res[2] = vaddq_s16(res[2], const_rounding);
+    res[3] = vaddq_s16(res[3], const_rounding);
+    res[4] = vaddq_s16(res[4], const_rounding);
+    res[5] = vaddq_s16(res[5], const_rounding);
+    res[6] = vaddq_s16(res[6], const_rounding);
+    res[7] = vaddq_s16(res[7], const_rounding);
+  }
+
+  res[0] = vsubq_s16(res[0], sign0);
+  res[1] = vsubq_s16(res[1], sign1);
+  res[2] = vsubq_s16(res[2], sign2);
+  res[3] = vsubq_s16(res[3], sign3);
+  res[4] = vsubq_s16(res[4], sign4);
+  res[5] = vsubq_s16(res[5], sign5);
+  res[6] = vsubq_s16(res[6], sign6);
+  res[7] = vsubq_s16(res[7], sign7);
+
+  if (bit == 1) {
+    res[0] = vshrq_n_s16(res[0], 1);
+    res[1] = vshrq_n_s16(res[1], 1);
+    res[2] = vshrq_n_s16(res[2], 1);
+    res[3] = vshrq_n_s16(res[3], 1);
+    res[4] = vshrq_n_s16(res[4], 1);
+    res[5] = vshrq_n_s16(res[5], 1);
+    res[6] = vshrq_n_s16(res[6], 1);
+    res[7] = vshrq_n_s16(res[7], 1);
+  } else {
+    res[0] = vshrq_n_s16(res[0], 2);
+    res[1] = vshrq_n_s16(res[1], 2);
+    res[2] = vshrq_n_s16(res[2], 2);
+    res[3] = vshrq_n_s16(res[3], 2);
+    res[4] = vshrq_n_s16(res[4], 2);
+    res[5] = vshrq_n_s16(res[5], 2);
+    res[6] = vshrq_n_s16(res[6], 2);
+    res[7] = vshrq_n_s16(res[7], 2);
+  }
+}
+
+static INLINE void write_buffer_8x8(tran_low_t *output, int16x8_t *res,
+                                    int stride) {
+  store_s16q_to_tran_low(output + 0 * stride, res[0]);
+  store_s16q_to_tran_low(output + 1 * stride, res[1]);
+  store_s16q_to_tran_low(output + 2 * stride, res[2]);
+  store_s16q_to_tran_low(output + 3 * stride, res[3]);
+  store_s16q_to_tran_low(output + 4 * stride, res[4]);
+  store_s16q_to_tran_low(output + 5 * stride, res[5]);
+  store_s16q_to_tran_low(output + 6 * stride, res[6]);
+  store_s16q_to_tran_low(output + 7 * stride, res[7]);
+}
+
+static INLINE void fadst8x8_neon(int16x8_t *in) {
+  int16x4_t x0_lo, x0_hi, x1_lo, x1_hi, x2_lo, x2_hi, x3_lo, x3_hi, x4_lo,
+      x4_hi, x5_lo, x5_hi, x6_lo, x6_hi, x7_lo, x7_hi;
+  int32x4_t s0_lo, s0_hi, s1_lo, s1_hi, s2_lo, s2_hi, s3_lo, s3_hi, s4_lo,
+      s4_hi, s5_lo, s5_hi, s6_lo, s6_hi, s7_lo, s7_hi;
+  int32x4_t t0_lo, t0_hi, t1_lo, t1_hi, t2_lo, t2_hi, t3_lo, t3_hi, t4_lo,
+      t4_hi, t5_lo, t5_hi, t6_lo, t6_hi, t7_lo, t7_hi;
+  const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING);
+
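+  // Inputs are consumed in the permuted order of the reference fadst8()
+  // (x0 = in[7], x1 = in[0], ...), and alternate outputs are negated before
+  // the final transpose.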
+  x0_lo = vget_low_s16(in[7]);
+  x0_hi = vget_high_s16(in[7]);
+  x1_lo = vget_low_s16(in[0]);
+  x1_hi = vget_high_s16(in[0]);
+  x2_lo = vget_low_s16(in[5]);
+  x2_hi = vget_high_s16(in[5]);
+  x3_lo = vget_low_s16(in[2]);
+  x3_hi = vget_high_s16(in[2]);
+  x4_lo = vget_low_s16(in[3]);
+  x4_hi = vget_high_s16(in[3]);
+  x5_lo = vget_low_s16(in[4]);
+  x5_hi = vget_high_s16(in[4]);
+  x6_lo = vget_low_s16(in[1]);
+  x6_hi = vget_high_s16(in[1]);
+  x7_lo = vget_low_s16(in[6]);
+  x7_hi = vget_high_s16(in[6]);
+
+  // stage 1
+  // s0 = cospi_2_64 * x0 + cospi_30_64 * x1;
+  s0_lo = vaddq_s32(vmull_n_s16(x0_lo, cospi_2_64),
+                    vmull_n_s16(x1_lo, cospi_30_64));
+  s0_hi = vaddq_s32(vmull_n_s16(x0_hi, cospi_2_64),
+                    vmull_n_s16(x1_hi, cospi_30_64));
+  // s1 = cospi_30_64 * x0 - cospi_2_64 * x1;
+  s1_lo = vsubq_s32(vmull_n_s16(x0_lo, cospi_30_64),
+                    vmull_n_s16(x1_lo, cospi_2_64));
+  s1_hi = vsubq_s32(vmull_n_s16(x0_hi, cospi_30_64),
+                    vmull_n_s16(x1_hi, cospi_2_64));
+  // s2 = cospi_10_64 * x2 + cospi_22_64 * x3;
+  s2_lo = vaddq_s32(vmull_n_s16(x2_lo, cospi_10_64),
+                    vmull_n_s16(x3_lo, cospi_22_64));
+  s2_hi = vaddq_s32(vmull_n_s16(x2_hi, cospi_10_64),
+                    vmull_n_s16(x3_hi, cospi_22_64));
+  // s3 = cospi_22_64 * x2 - cospi_10_64 * x3;
+  s3_lo = vsubq_s32(vmull_n_s16(x2_lo, cospi_22_64),
+                    vmull_n_s16(x3_lo, cospi_10_64));
+  s3_hi = vsubq_s32(vmull_n_s16(x2_hi, cospi_22_64),
+                    vmull_n_s16(x3_hi, cospi_10_64));
+  // s4 = cospi_18_64 * x4 + cospi_14_64 * x5;
+  s4_lo = vaddq_s32(vmull_n_s16(x4_lo, cospi_18_64),
+                    vmull_n_s16(x5_lo, cospi_14_64));
+  s4_hi = vaddq_s32(vmull_n_s16(x4_hi, cospi_18_64),
+                    vmull_n_s16(x5_hi, cospi_14_64));
+  // s5 = cospi_14_64 * x4 - cospi_18_64 * x5;
+  s5_lo = vsubq_s32(vmull_n_s16(x4_lo, cospi_14_64),
+                    vmull_n_s16(x5_lo, cospi_18_64));
+  s5_hi = vsubq_s32(vmull_n_s16(x4_hi, cospi_14_64),
+                    vmull_n_s16(x5_hi, cospi_18_64));
+  // s6 = cospi_26_64 * x6 + cospi_6_64 * x7;
+  s6_lo = vaddq_s32(vmull_n_s16(x6_lo, cospi_26_64),
+                    vmull_n_s16(x7_lo, cospi_6_64));
+  s6_hi = vaddq_s32(vmull_n_s16(x6_hi, cospi_26_64),
+                    vmull_n_s16(x7_hi, cospi_6_64));
+  // s7 = cospi_6_64 * x6 - cospi_26_64 * x7;
+  s7_lo = vsubq_s32(vmull_n_s16(x6_lo, cospi_6_64),
+                    vmull_n_s16(x7_lo, cospi_26_64));
+  s7_hi = vsubq_s32(vmull_n_s16(x6_hi, cospi_6_64),
+                    vmull_n_s16(x7_hi, cospi_26_64));
+
+  // t[k] = s[k] +/- s[k + 4] (butterfly), followed by fdct_round_shift
+  t0_lo = vaddq_s32(s0_lo, s4_lo);
+  t0_hi = vaddq_s32(s0_hi, s4_hi);
+  t1_lo = vaddq_s32(s1_lo, s5_lo);
+  t1_hi = vaddq_s32(s1_hi, s5_hi);
+  t2_lo = vaddq_s32(s2_lo, s6_lo);
+  t2_hi = vaddq_s32(s2_hi, s6_hi);
+  t3_lo = vaddq_s32(s3_lo, s7_lo);
+  t3_hi = vaddq_s32(s3_hi, s7_hi);
+  t4_lo = vsubq_s32(s0_lo, s4_lo);
+  t4_hi = vsubq_s32(s0_hi, s4_hi);
+  t5_lo = vsubq_s32(s1_lo, s5_lo);
+  t5_hi = vsubq_s32(s1_hi, s5_hi);
+  t6_lo = vsubq_s32(s2_lo, s6_lo);
+  t6_hi = vsubq_s32(s2_hi, s6_hi);
+  t7_lo = vsubq_s32(s3_lo, s7_lo);
+  t7_hi = vsubq_s32(s3_hi, s7_hi);
+
+  t0_lo = vaddq_s32(t0_lo, k__DCT_CONST_ROUNDING);
+  t0_hi = vaddq_s32(t0_hi, k__DCT_CONST_ROUNDING);
+  t1_lo = vaddq_s32(t1_lo, k__DCT_CONST_ROUNDING);
+  t1_hi = vaddq_s32(t1_hi, k__DCT_CONST_ROUNDING);
+  t2_lo = vaddq_s32(t2_lo, k__DCT_CONST_ROUNDING);
+  t2_hi = vaddq_s32(t2_hi, k__DCT_CONST_ROUNDING);
+  t3_lo = vaddq_s32(t3_lo, k__DCT_CONST_ROUNDING);
+  t3_hi = vaddq_s32(t3_hi, k__DCT_CONST_ROUNDING);
+  t4_lo = vaddq_s32(t4_lo, k__DCT_CONST_ROUNDING);
+  t4_hi = vaddq_s32(t4_hi, k__DCT_CONST_ROUNDING);
+  t5_lo = vaddq_s32(t5_lo, k__DCT_CONST_ROUNDING);
+  t5_hi = vaddq_s32(t5_hi, k__DCT_CONST_ROUNDING);
+  t6_lo = vaddq_s32(t6_lo, k__DCT_CONST_ROUNDING);
+  t6_hi = vaddq_s32(t6_hi, k__DCT_CONST_ROUNDING);
+  t7_lo = vaddq_s32(t7_lo, k__DCT_CONST_ROUNDING);
+  t7_hi = vaddq_s32(t7_hi, k__DCT_CONST_ROUNDING);
+
+  t0_lo = vshrq_n_s32(t0_lo, DCT_CONST_BITS);
+  t0_hi = vshrq_n_s32(t0_hi, DCT_CONST_BITS);
+  t1_lo = vshrq_n_s32(t1_lo, DCT_CONST_BITS);
+  t1_hi = vshrq_n_s32(t1_hi, DCT_CONST_BITS);
+  t2_lo = vshrq_n_s32(t2_lo, DCT_CONST_BITS);
+  t2_hi = vshrq_n_s32(t2_hi, DCT_CONST_BITS);
+  t3_lo = vshrq_n_s32(t3_lo, DCT_CONST_BITS);
+  t3_hi = vshrq_n_s32(t3_hi, DCT_CONST_BITS);
+  t4_lo = vshrq_n_s32(t4_lo, DCT_CONST_BITS);
+  t4_hi = vshrq_n_s32(t4_hi, DCT_CONST_BITS);
+  t5_lo = vshrq_n_s32(t5_lo, DCT_CONST_BITS);
+  t5_hi = vshrq_n_s32(t5_hi, DCT_CONST_BITS);
+  t6_lo = vshrq_n_s32(t6_lo, DCT_CONST_BITS);
+  t6_hi = vshrq_n_s32(t6_hi, DCT_CONST_BITS);
+  t7_lo = vshrq_n_s32(t7_lo, DCT_CONST_BITS);
+  t7_hi = vshrq_n_s32(t7_hi, DCT_CONST_BITS);
+
+  // stage 2
+  s0_lo = t0_lo;
+  s0_hi = t0_hi;
+  s1_lo = t1_lo;
+  s1_hi = t1_hi;
+  s2_lo = t2_lo;
+  s2_hi = t2_hi;
+  s3_lo = t3_lo;
+  s3_hi = t3_hi;
+  s4_lo = vaddq_s32(vmulq_n_s32(t4_lo, cospi_8_64),
+                    vmulq_n_s32(t5_lo, cospi_24_64));
+  s4_hi = vaddq_s32(vmulq_n_s32(t4_hi, cospi_8_64),
+                    vmulq_n_s32(t5_hi, cospi_24_64));
+  s5_lo = vsubq_s32(vmulq_n_s32(t4_lo, cospi_24_64),
+                    vmulq_n_s32(t5_lo, cospi_8_64));
+  s5_hi = vsubq_s32(vmulq_n_s32(t4_hi, cospi_24_64),
+                    vmulq_n_s32(t5_hi, cospi_8_64));
+  s6_lo = vaddq_s32(vmulq_n_s32(t6_lo, -cospi_24_64),
+                    vmulq_n_s32(t7_lo, cospi_8_64));
+  s6_hi = vaddq_s32(vmulq_n_s32(t6_hi, -cospi_24_64),
+                    vmulq_n_s32(t7_hi, cospi_8_64));
+  s7_lo = vaddq_s32(vmulq_n_s32(t6_lo, cospi_8_64),
+                    vmulq_n_s32(t7_lo, cospi_24_64));
+  s7_hi = vaddq_s32(vmulq_n_s32(t6_hi, cospi_8_64),
+                    vmulq_n_s32(t7_hi, cospi_24_64));
+
+  // s0 + s2
+  t0_lo = vaddq_s32(s0_lo, s2_lo);
+  t0_hi = vaddq_s32(s0_hi, s2_hi);
+  // s1 + s3
+  t1_lo = vaddq_s32(s1_lo, s3_lo);
+  t1_hi = vaddq_s32(s1_hi, s3_hi);
+  // s0 - s2
+  t2_lo = vsubq_s32(s0_lo, s2_lo);
+  t2_hi = vsubq_s32(s0_hi, s2_hi);
+  // s1 - s3
+  t3_lo = vsubq_s32(s1_lo, s3_lo);
+  t3_hi = vsubq_s32(s1_hi, s3_hi);
+  // s4 + s6
+  t4_lo = vaddq_s32(s4_lo, s6_lo);
+  t4_hi = vaddq_s32(s4_hi, s6_hi);
+  // s5 + s7
+  t5_lo = vaddq_s32(s5_lo, s7_lo);
+  t5_hi = vaddq_s32(s5_hi, s7_hi);
+  // s4 - s6
+  t6_lo = vsubq_s32(s4_lo, s6_lo);
+  t6_hi = vsubq_s32(s4_hi, s6_hi);
+  // s5 - s7
+  t7_lo = vsubq_s32(s5_lo, s7_lo);
+  t7_hi = vsubq_s32(s5_hi, s7_hi);
+
+  // fdct_round_shift
+  t4_lo = vaddq_s32(t4_lo, k__DCT_CONST_ROUNDING);
+  t4_hi = vaddq_s32(t4_hi, k__DCT_CONST_ROUNDING);
+  t5_lo = vaddq_s32(t5_lo, k__DCT_CONST_ROUNDING);
+  t5_hi = vaddq_s32(t5_hi, k__DCT_CONST_ROUNDING);
+  t6_lo = vaddq_s32(t6_lo, k__DCT_CONST_ROUNDING);
+  t6_hi = vaddq_s32(t6_hi, k__DCT_CONST_ROUNDING);
+  t7_lo = vaddq_s32(t7_lo, k__DCT_CONST_ROUNDING);
+  t7_hi = vaddq_s32(t7_hi, k__DCT_CONST_ROUNDING);
+  t4_lo = vshrq_n_s32(t4_lo, DCT_CONST_BITS);
+  t4_hi = vshrq_n_s32(t4_hi, DCT_CONST_BITS);
+  t5_lo = vshrq_n_s32(t5_lo, DCT_CONST_BITS);
+  t5_hi = vshrq_n_s32(t5_hi, DCT_CONST_BITS);
+  t6_lo = vshrq_n_s32(t6_lo, DCT_CONST_BITS);
+  t6_hi = vshrq_n_s32(t6_hi, DCT_CONST_BITS);
+  t7_lo = vshrq_n_s32(t7_lo, DCT_CONST_BITS);
+  t7_hi = vshrq_n_s32(t7_hi, DCT_CONST_BITS);
+
+  // stage 3
+  // cospi_16_64 * (x2 + x3)
+  s2_lo = vmulq_n_s32(vaddq_s32(t2_lo, t3_lo), cospi_16_64);
+  s2_hi = vmulq_n_s32(vaddq_s32(t2_hi, t3_hi), cospi_16_64);
+  // cospi_16_64 * (x2 - x3)
+  s3_lo = vmulq_n_s32(vsubq_s32(t2_lo, t3_lo), cospi_16_64);
+  s3_hi = vmulq_n_s32(vsubq_s32(t2_hi, t3_hi), cospi_16_64);
+  // cospi_16_64 * (x6 + x7)
+  s6_lo = vmulq_n_s32(vaddq_s32(t6_lo, t7_lo), cospi_16_64);
+  s6_hi = vmulq_n_s32(vaddq_s32(t6_hi, t7_hi), cospi_16_64);
+  // cospi_16_64 * (x6 - x7)
+  s7_lo = vmulq_n_s32(vsubq_s32(t6_lo, t7_lo), cospi_16_64);
+  s7_hi = vmulq_n_s32(vsubq_s32(t6_hi, t7_hi), cospi_16_64);
+
+  // final fdct_round_shift
+  t2_lo = vaddq_s32(s2_lo, k__DCT_CONST_ROUNDING);
+  t2_hi = vaddq_s32(s2_hi, k__DCT_CONST_ROUNDING);
+  t3_lo = vaddq_s32(s3_lo, k__DCT_CONST_ROUNDING);
+  t3_hi = vaddq_s32(s3_hi, k__DCT_CONST_ROUNDING);
+  t6_lo = vaddq_s32(s6_lo, k__DCT_CONST_ROUNDING);
+  t6_hi = vaddq_s32(s6_hi, k__DCT_CONST_ROUNDING);
+  t7_lo = vaddq_s32(s7_lo, k__DCT_CONST_ROUNDING);
+  t7_hi = vaddq_s32(s7_hi, k__DCT_CONST_ROUNDING);
+
+  x2_lo = vshrn_n_s32(t2_lo, DCT_CONST_BITS);
+  x2_hi = vshrn_n_s32(t2_hi, DCT_CONST_BITS);
+  x3_lo = vshrn_n_s32(t3_lo, DCT_CONST_BITS);
+  x3_hi = vshrn_n_s32(t3_hi, DCT_CONST_BITS);
+  x6_lo = vshrn_n_s32(t6_lo, DCT_CONST_BITS);
+  x6_hi = vshrn_n_s32(t6_hi, DCT_CONST_BITS);
+  x7_lo = vshrn_n_s32(t7_lo, DCT_CONST_BITS);
+  x7_hi = vshrn_n_s32(t7_hi, DCT_CONST_BITS);
+
+  // x0, x1, x4, x5 narrow down to 16-bits directly
+  x0_lo = vmovn_s32(t0_lo);
+  x0_hi = vmovn_s32(t0_hi);
+  x1_lo = vmovn_s32(t1_lo);
+  x1_hi = vmovn_s32(t1_hi);
+  x4_lo = vmovn_s32(t4_lo);
+  x4_hi = vmovn_s32(t4_hi);
+  x5_lo = vmovn_s32(t5_lo);
+  x5_hi = vmovn_s32(t5_hi);
+
+  in[0] = vcombine_s16(x0_lo, x0_hi);
+  in[1] = vnegq_s16(vcombine_s16(x4_lo, x4_hi));
+  in[2] = vcombine_s16(x6_lo, x6_hi);
+  in[3] = vnegq_s16(vcombine_s16(x2_lo, x2_hi));
+  in[4] = vcombine_s16(x3_lo, x3_hi);
+  in[5] = vnegq_s16(vcombine_s16(x7_lo, x7_hi));
+  in[6] = vcombine_s16(x5_lo, x5_hi);
+  in[7] = vnegq_s16(vcombine_s16(x1_lo, x1_hi));
+
+  transpose_s16_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
+                    &in[7]);
+}
+
+void vp9_fht8x8_neon(const int16_t *input, tran_low_t *output, int stride,
+                     int tx_type) {
+  int16x8_t in[8];
+
+  switch (tx_type) {
+    case DCT_DCT: vpx_fdct8x8_neon(input, output, stride); break;
+    case ADST_DCT:
+      load_buffer_8x8(input, in, stride);
+      fadst8x8_neon(in);
+      vpx_fdct8x8_pass1_neon(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    case DCT_ADST:
+      load_buffer_8x8(input, in, stride);
+      vpx_fdct8x8_pass1_neon(in);
+      fadst8x8_neon(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      load_buffer_8x8(input, in, stride);
+      fadst8x8_neon(in);
+      fadst8x8_neon(in);
+      right_shift_8x8(in, 1);
+      write_buffer_8x8(output, in, 8);
+      break;
+  }
+}
+
+static INLINE void load_buffer_16x16(const int16_t *input, int16x8_t *in0,
+                                     int16x8_t *in1, int stride) {
+  // load first 8 columns
+  load_buffer_8x8(input, in0, stride);
+  load_buffer_8x8(input + 8 * stride, in0 + 8, stride);
+
+  input += 8;
+  // load second 8 columns
+  load_buffer_8x8(input, in1, stride);
+  load_buffer_8x8(input + 8 * stride, in1 + 8, stride);
+}
+
+static INLINE void write_buffer_16x16(tran_low_t *output, int16x8_t *in0,
+                                      int16x8_t *in1, int stride) {
+  // write first 8 columns
+  write_buffer_8x8(output, in0, stride);
+  write_buffer_8x8(output + 8 * stride, in0 + 8, stride);
+
+  // write second 8 columns
+  output += 8;
+  write_buffer_8x8(output, in1, stride);
+  write_buffer_8x8(output + 8 * stride, in1 + 8, stride);
+}
+
+static INLINE void right_shift_16x16(int16x8_t *res0, int16x8_t *res1) {
+  // perform rounding operations
+  right_shift_8x8(res0, 2);
+  right_shift_8x8(res0 + 8, 2);
+  right_shift_8x8(res1, 2);
+  right_shift_8x8(res1 + 8, 2);
+}
+
+static void fdct16_8col(int16x8_t *in) {
+  // perform a 16-point 1-D DCT on 8 columns
+  int16x8_t i[8], s1[8], s2[8], s3[8], t[8];
+  int16x4_t t_lo[8], t_hi[8];
+  int32x4_t u_lo[8], u_hi[8];
+  const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING);
+
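+  // The 16-point DCT splits in two: an 8-point DCT of the sums
+  // in[k] + in[15 - k] yields the even outputs, while a butterfly chain over
+  // the differences in[7 - k] - in[8 + k] yields the odd ones; the halves are
+  // re-interleaved at the end of the function.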
+  // stage 1
+  i[0] = vaddq_s16(in[0], in[15]);
+  i[1] = vaddq_s16(in[1], in[14]);
+  i[2] = vaddq_s16(in[2], in[13]);
+  i[3] = vaddq_s16(in[3], in[12]);
+  i[4] = vaddq_s16(in[4], in[11]);
+  i[5] = vaddq_s16(in[5], in[10]);
+  i[6] = vaddq_s16(in[6], in[9]);
+  i[7] = vaddq_s16(in[7], in[8]);
+
+  vpx_fdct8x8_pass1_neon(i);
+  transpose_s16_8x8(&i[0], &i[1], &i[2], &i[3], &i[4], &i[5], &i[6], &i[7]);
+
+  // step 2
+  s1[0] = vsubq_s16(in[7], in[8]);
+  s1[1] = vsubq_s16(in[6], in[9]);
+  s1[2] = vsubq_s16(in[5], in[10]);
+  s1[3] = vsubq_s16(in[4], in[11]);
+  s1[4] = vsubq_s16(in[3], in[12]);
+  s1[5] = vsubq_s16(in[2], in[13]);
+  s1[6] = vsubq_s16(in[1], in[14]);
+  s1[7] = vsubq_s16(in[0], in[15]);
+
+  t[2] = vsubq_s16(s1[5], s1[2]);
+  t[3] = vsubq_s16(s1[4], s1[3]);
+  t[4] = vaddq_s16(s1[4], s1[3]);
+  t[5] = vaddq_s16(s1[5], s1[2]);
+
+  t_lo[2] = vget_low_s16(t[2]);
+  t_hi[2] = vget_high_s16(t[2]);
+  t_lo[3] = vget_low_s16(t[3]);
+  t_hi[3] = vget_high_s16(t[3]);
+  t_lo[4] = vget_low_s16(t[4]);
+  t_hi[4] = vget_high_s16(t[4]);
+  t_lo[5] = vget_low_s16(t[5]);
+  t_hi[5] = vget_high_s16(t[5]);
+
+  u_lo[2] = vmull_n_s16(t_lo[2], cospi_16_64);
+  u_hi[2] = vmull_n_s16(t_hi[2], cospi_16_64);
+  u_lo[3] = vmull_n_s16(t_lo[3], cospi_16_64);
+  u_hi[3] = vmull_n_s16(t_hi[3], cospi_16_64);
+  u_lo[4] = vmull_n_s16(t_lo[4], cospi_16_64);
+  u_hi[4] = vmull_n_s16(t_hi[4], cospi_16_64);
+  u_lo[5] = vmull_n_s16(t_lo[5], cospi_16_64);
+  u_hi[5] = vmull_n_s16(t_hi[5], cospi_16_64);
+
+  u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING);
+  u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING);
+  u_lo[3] = vaddq_s32(u_lo[3], k__DCT_CONST_ROUNDING);
+  u_hi[3] = vaddq_s32(u_hi[3], k__DCT_CONST_ROUNDING);
+  u_lo[4] = vaddq_s32(u_lo[4], k__DCT_CONST_ROUNDING);
+  u_hi[4] = vaddq_s32(u_hi[4], k__DCT_CONST_ROUNDING);
+  u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING);
+  u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING);
+
+  t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS);
+  t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS);
+  t_lo[3] = vshrn_n_s32(u_lo[3], DCT_CONST_BITS);
+  t_hi[3] = vshrn_n_s32(u_hi[3], DCT_CONST_BITS);
+  t_lo[4] = vshrn_n_s32(u_lo[4], DCT_CONST_BITS);
+  t_hi[4] = vshrn_n_s32(u_hi[4], DCT_CONST_BITS);
+  t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS);
+  t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS);
+
+  s2[2] = vcombine_s16(t_lo[2], t_hi[2]);
+  s2[3] = vcombine_s16(t_lo[3], t_hi[3]);
+  s2[4] = vcombine_s16(t_lo[4], t_hi[4]);
+  s2[5] = vcombine_s16(t_lo[5], t_hi[5]);
+
+  // step 3
+  s3[0] = vaddq_s16(s1[0], s2[3]);
+  s3[1] = vaddq_s16(s1[1], s2[2]);
+  s3[2] = vsubq_s16(s1[1], s2[2]);
+  s3[3] = vsubq_s16(s1[0], s2[3]);
+  s3[4] = vsubq_s16(s1[7], s2[4]);
+  s3[5] = vsubq_s16(s1[6], s2[5]);
+  s3[6] = vaddq_s16(s1[6], s2[5]);
+  s3[7] = vaddq_s16(s1[7], s2[4]);
+
+  // step 4
+  t_lo[0] = vget_low_s16(s3[0]);
+  t_hi[0] = vget_high_s16(s3[0]);
+  t_lo[1] = vget_low_s16(s3[1]);
+  t_hi[1] = vget_high_s16(s3[1]);
+  t_lo[2] = vget_low_s16(s3[2]);
+  t_hi[2] = vget_high_s16(s3[2]);
+  t_lo[3] = vget_low_s16(s3[3]);
+  t_hi[3] = vget_high_s16(s3[3]);
+  t_lo[4] = vget_low_s16(s3[4]);
+  t_hi[4] = vget_high_s16(s3[4]);
+  t_lo[5] = vget_low_s16(s3[5]);
+  t_hi[5] = vget_high_s16(s3[5]);
+  t_lo[6] = vget_low_s16(s3[6]);
+  t_hi[6] = vget_high_s16(s3[6]);
+  t_lo[7] = vget_low_s16(s3[7]);
+  t_hi[7] = vget_high_s16(s3[7]);
+
+  u_lo[1] = vaddq_s32(vmull_n_s16(t_lo[1], -cospi_8_64),
+                      vmull_n_s16(t_lo[6], cospi_24_64));
+  u_hi[1] = vaddq_s32(vmull_n_s16(t_hi[1], -cospi_8_64),
+                      vmull_n_s16(t_hi[6], cospi_24_64));
+  u_lo[2] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_24_64),
+                      vmull_n_s16(t_lo[5], cospi_8_64));
+  u_hi[2] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_24_64),
+                      vmull_n_s16(t_hi[5], cospi_8_64));
+  u_lo[5] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_8_64),
+                      vmull_n_s16(t_lo[5], -cospi_24_64));
+  u_hi[5] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_8_64),
+                      vmull_n_s16(t_hi[5], -cospi_24_64));
+  u_lo[6] = vaddq_s32(vmull_n_s16(t_lo[1], cospi_24_64),
+                      vmull_n_s16(t_lo[6], cospi_8_64));
+  u_hi[6] = vaddq_s32(vmull_n_s16(t_hi[1], cospi_24_64),
+                      vmull_n_s16(t_hi[6], cospi_8_64));
+
+  u_lo[1] = vaddq_s32(u_lo[1], k__DCT_CONST_ROUNDING);
+  u_hi[1] = vaddq_s32(u_hi[1], k__DCT_CONST_ROUNDING);
+  u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING);
+  u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING);
+  u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING);
+  u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING);
+  u_lo[6] = vaddq_s32(u_lo[6], k__DCT_CONST_ROUNDING);
+  u_hi[6] = vaddq_s32(u_hi[6], k__DCT_CONST_ROUNDING);
+
+  t_lo[1] = vshrn_n_s32(u_lo[1], DCT_CONST_BITS);
+  t_hi[1] = vshrn_n_s32(u_hi[1], DCT_CONST_BITS);
+  t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS);
+  t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS);
+  t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS);
+  t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS);
+  t_lo[6] = vshrn_n_s32(u_lo[6], DCT_CONST_BITS);
+  t_hi[6] = vshrn_n_s32(u_hi[6], DCT_CONST_BITS);
+
+  s2[1] = vcombine_s16(t_lo[1], t_hi[1]);
+  s2[2] = vcombine_s16(t_lo[2], t_hi[2]);
+  s2[5] = vcombine_s16(t_lo[5], t_hi[5]);
+  s2[6] = vcombine_s16(t_lo[6], t_hi[6]);
+
+  // step 5
+  s1[0] = vaddq_s16(s3[0], s2[1]);
+  s1[1] = vsubq_s16(s3[0], s2[1]);
+  s1[2] = vaddq_s16(s3[3], s2[2]);
+  s1[3] = vsubq_s16(s3[3], s2[2]);
+  s1[4] = vsubq_s16(s3[4], s2[5]);
+  s1[5] = vaddq_s16(s3[4], s2[5]);
+  s1[6] = vsubq_s16(s3[7], s2[6]);
+  s1[7] = vaddq_s16(s3[7], s2[6]);
+
+  // step 6
+  t_lo[0] = vget_low_s16(s1[0]);
+  t_hi[0] = vget_high_s16(s1[0]);
+  t_lo[1] = vget_low_s16(s1[1]);
+  t_hi[1] = vget_high_s16(s1[1]);
+  t_lo[2] = vget_low_s16(s1[2]);
+  t_hi[2] = vget_high_s16(s1[2]);
+  t_lo[3] = vget_low_s16(s1[3]);
+  t_hi[3] = vget_high_s16(s1[3]);
+  t_lo[4] = vget_low_s16(s1[4]);
+  t_hi[4] = vget_high_s16(s1[4]);
+  t_lo[5] = vget_low_s16(s1[5]);
+  t_hi[5] = vget_high_s16(s1[5]);
+  t_lo[6] = vget_low_s16(s1[6]);
+  t_hi[6] = vget_high_s16(s1[6]);
+  t_lo[7] = vget_low_s16(s1[7]);
+  t_hi[7] = vget_high_s16(s1[7]);
+
+  // step1[0] * cospi_30_64 + step1[7] * cospi_2_64;
+  u_lo[0] = vaddq_s32(vmull_n_s16(t_lo[0], cospi_30_64),
+                      vmull_n_s16(t_lo[7], cospi_2_64));
+  u_hi[0] = vaddq_s32(vmull_n_s16(t_hi[0], cospi_30_64),
+                      vmull_n_s16(t_hi[7], cospi_2_64));
+
+  // step1[1] * cospi_14_64 + step1[6] * cospi_18_64;
+  u_lo[1] = vaddq_s32(vmull_n_s16(t_lo[1], cospi_14_64),
+                      vmull_n_s16(t_lo[6], cospi_18_64));
+  u_hi[1] = vaddq_s32(vmull_n_s16(t_hi[1], cospi_14_64),
+                      vmull_n_s16(t_hi[6], cospi_18_64));
+
+  // step1[2] * cospi_22_64 + step1[5] * cospi_10_64;
+  u_lo[2] = vaddq_s32(vmull_n_s16(t_lo[2], cospi_22_64),
+                      vmull_n_s16(t_lo[5], cospi_10_64));
+  u_hi[2] = vaddq_s32(vmull_n_s16(t_hi[2], cospi_22_64),
+                      vmull_n_s16(t_hi[5], cospi_10_64));
+
+  // step1[3] * cospi_6_64 + step1[4] * cospi_26_64;
+  u_lo[3] = vaddq_s32(vmull_n_s16(t_lo[3], cospi_6_64),
+                      vmull_n_s16(t_lo[4], cospi_26_64));
+  u_hi[3] = vaddq_s32(vmull_n_s16(t_hi[3], cospi_6_64),
+                      vmull_n_s16(t_hi[4], cospi_26_64));
+
+  // step1[3] * -cospi_26_64 + step1[4] * cospi_6_64;
+  u_lo[4] = vaddq_s32(vmull_n_s16(t_lo[3], -cospi_26_64),
+                      vmull_n_s16(t_lo[4], cospi_6_64));
+  u_hi[4] = vaddq_s32(vmull_n_s16(t_hi[3], -cospi_26_64),
+                      vmull_n_s16(t_hi[4], cospi_6_64));
+
+  // step1[2] * -cospi_10_64 + step1[5] * cospi_22_64;
+  u_lo[5] = vaddq_s32(vmull_n_s16(t_lo[2], -cospi_10_64),
+                      vmull_n_s16(t_lo[5], cospi_22_64));
+  u_hi[5] = vaddq_s32(vmull_n_s16(t_hi[2], -cospi_10_64),
+                      vmull_n_s16(t_hi[5], cospi_22_64));
+
+  // step1[1] * -cospi_18_64 + step1[6] * cospi_14_64;
+  u_lo[6] = vaddq_s32(vmull_n_s16(t_lo[1], -cospi_18_64),
+                      vmull_n_s16(t_lo[6], cospi_14_64));
+  u_hi[6] = vaddq_s32(vmull_n_s16(t_hi[1], -cospi_18_64),
+                      vmull_n_s16(t_hi[6], cospi_14_64));
+
+  // step1[0] * -cospi_2_64 + step1[7] * cospi_30_64;
+  u_lo[7] = vaddq_s32(vmull_n_s16(t_lo[0], -cospi_2_64),
+                      vmull_n_s16(t_lo[7], cospi_30_64));
+  u_hi[7] = vaddq_s32(vmull_n_s16(t_hi[0], -cospi_2_64),
+                      vmull_n_s16(t_hi[7], cospi_30_64));
+
+  // final fdct_round_shift
+  u_lo[0] = vaddq_s32(u_lo[0], k__DCT_CONST_ROUNDING);
+  u_hi[0] = vaddq_s32(u_hi[0], k__DCT_CONST_ROUNDING);
+  u_lo[1] = vaddq_s32(u_lo[1], k__DCT_CONST_ROUNDING);
+  u_hi[1] = vaddq_s32(u_hi[1], k__DCT_CONST_ROUNDING);
+  u_lo[2] = vaddq_s32(u_lo[2], k__DCT_CONST_ROUNDING);
+  u_hi[2] = vaddq_s32(u_hi[2], k__DCT_CONST_ROUNDING);
+  u_lo[3] = vaddq_s32(u_lo[3], k__DCT_CONST_ROUNDING);
+  u_hi[3] = vaddq_s32(u_hi[3], k__DCT_CONST_ROUNDING);
+  u_lo[4] = vaddq_s32(u_lo[4], k__DCT_CONST_ROUNDING);
+  u_hi[4] = vaddq_s32(u_hi[4], k__DCT_CONST_ROUNDING);
+  u_lo[5] = vaddq_s32(u_lo[5], k__DCT_CONST_ROUNDING);
+  u_hi[5] = vaddq_s32(u_hi[5], k__DCT_CONST_ROUNDING);
+  u_lo[6] = vaddq_s32(u_lo[6], k__DCT_CONST_ROUNDING);
+  u_hi[6] = vaddq_s32(u_hi[6], k__DCT_CONST_ROUNDING);
+  u_lo[7] = vaddq_s32(u_lo[7], k__DCT_CONST_ROUNDING);
+  u_hi[7] = vaddq_s32(u_hi[7], k__DCT_CONST_ROUNDING);
+
+  t_lo[0] = vshrn_n_s32(u_lo[0], DCT_CONST_BITS);
+  t_hi[0] = vshrn_n_s32(u_hi[0], DCT_CONST_BITS);
+  t_lo[1] = vshrn_n_s32(u_lo[1], DCT_CONST_BITS);
+  t_hi[1] = vshrn_n_s32(u_hi[1], DCT_CONST_BITS);
+  t_lo[2] = vshrn_n_s32(u_lo[2], DCT_CONST_BITS);
+  t_hi[2] = vshrn_n_s32(u_hi[2], DCT_CONST_BITS);
+  t_lo[3] = vshrn_n_s32(u_lo[3], DCT_CONST_BITS);
+  t_hi[3] = vshrn_n_s32(u_hi[3], DCT_CONST_BITS);
+  t_lo[4] = vshrn_n_s32(u_lo[4], DCT_CONST_BITS);
+  t_hi[4] = vshrn_n_s32(u_hi[4], DCT_CONST_BITS);
+  t_lo[5] = vshrn_n_s32(u_lo[5], DCT_CONST_BITS);
+  t_hi[5] = vshrn_n_s32(u_hi[5], DCT_CONST_BITS);
+  t_lo[6] = vshrn_n_s32(u_lo[6], DCT_CONST_BITS);
+  t_hi[6] = vshrn_n_s32(u_hi[6], DCT_CONST_BITS);
+  t_lo[7] = vshrn_n_s32(u_lo[7], DCT_CONST_BITS);
+  t_hi[7] = vshrn_n_s32(u_hi[7], DCT_CONST_BITS);
+
+  in[0] = i[0];
+  in[2] = i[1];
+  in[4] = i[2];
+  in[6] = i[3];
+  in[8] = i[4];
+  in[10] = i[5];
+  in[12] = i[6];
+  in[14] = i[7];
+  in[1] = vcombine_s16(t_lo[0], t_hi[0]);
+  in[3] = vcombine_s16(t_lo[4], t_hi[4]);
+  in[5] = vcombine_s16(t_lo[2], t_hi[2]);
+  in[7] = vcombine_s16(t_lo[6], t_hi[6]);
+  in[9] = vcombine_s16(t_lo[1], t_hi[1]);
+  in[11] = vcombine_s16(t_lo[5], t_hi[5]);
+  in[13] = vcombine_s16(t_lo[3], t_hi[3]);
+  in[15] = vcombine_s16(t_lo[7], t_hi[7]);
+}
+
+static void fadst16_8col(int16x8_t *in) {
+  // perform a 16-point 1-D ADST on 8 columns
+  int16x4_t x_lo[16], x_hi[16];
+  int32x4_t s_lo[16], s_hi[16];
+  int32x4_t t_lo[16], t_hi[16];
+  const int32x4_t k__DCT_CONST_ROUNDING = vdupq_n_s32(DCT_CONST_ROUNDING);
+
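+  // Inputs are consumed in the permuted order of the reference fadst16();
+  // four butterfly stages follow, each stage that multiplies by cosine
+  // constants ending with an fdct_round_shift.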
+  x_lo[0] = vget_low_s16(in[15]);
+  x_hi[0] = vget_high_s16(in[15]);
+  x_lo[1] = vget_low_s16(in[0]);
+  x_hi[1] = vget_high_s16(in[0]);
+  x_lo[2] = vget_low_s16(in[13]);
+  x_hi[2] = vget_high_s16(in[13]);
+  x_lo[3] = vget_low_s16(in[2]);
+  x_hi[3] = vget_high_s16(in[2]);
+  x_lo[4] = vget_low_s16(in[11]);
+  x_hi[4] = vget_high_s16(in[11]);
+  x_lo[5] = vget_low_s16(in[4]);
+  x_hi[5] = vget_high_s16(in[4]);
+  x_lo[6] = vget_low_s16(in[9]);
+  x_hi[6] = vget_high_s16(in[9]);
+  x_lo[7] = vget_low_s16(in[6]);
+  x_hi[7] = vget_high_s16(in[6]);
+  x_lo[8] = vget_low_s16(in[7]);
+  x_hi[8] = vget_high_s16(in[7]);
+  x_lo[9] = vget_low_s16(in[8]);
+  x_hi[9] = vget_high_s16(in[8]);
+  x_lo[10] = vget_low_s16(in[5]);
+  x_hi[10] = vget_high_s16(in[5]);
+  x_lo[11] = vget_low_s16(in[10]);
+  x_hi[11] = vget_high_s16(in[10]);
+  x_lo[12] = vget_low_s16(in[3]);
+  x_hi[12] = vget_high_s16(in[3]);
+  x_lo[13] = vget_low_s16(in[12]);
+  x_hi[13] = vget_high_s16(in[12]);
+  x_lo[14] = vget_low_s16(in[1]);
+  x_hi[14] = vget_high_s16(in[1]);
+  x_lo[15] = vget_low_s16(in[14]);
+  x_hi[15] = vget_high_s16(in[14]);
+
+  // stage 1
+  // s0 = cospi_1_64 * x0 + cospi_31_64 * x1;
+  s_lo[0] = vaddq_s32(vmull_n_s16(x_lo[0], cospi_1_64),
+                      vmull_n_s16(x_lo[1], cospi_31_64));
+  s_hi[0] = vaddq_s32(vmull_n_s16(x_hi[0], cospi_1_64),
+                      vmull_n_s16(x_hi[1], cospi_31_64));
+  // s1 = cospi_31_64 * x0 - cospi_1_64 * x1;
+  s_lo[1] = vsubq_s32(vmull_n_s16(x_lo[0], cospi_31_64),
+                      vmull_n_s16(x_lo[1], cospi_1_64));
+  s_hi[1] = vsubq_s32(vmull_n_s16(x_hi[0], cospi_31_64),
+                      vmull_n_s16(x_hi[1], cospi_1_64));
+  // s2 = cospi_5_64 * x2 + cospi_27_64 * x3;
+  s_lo[2] = vaddq_s32(vmull_n_s16(x_lo[2], cospi_5_64),
+                      vmull_n_s16(x_lo[3], cospi_27_64));
+  s_hi[2] = vaddq_s32(vmull_n_s16(x_hi[2], cospi_5_64),
+                      vmull_n_s16(x_hi[3], cospi_27_64));
+  // s3 = cospi_27_64 * x2 - cospi_5_64 * x3;
+  s_lo[3] = vsubq_s32(vmull_n_s16(x_lo[2], cospi_27_64),
+                      vmull_n_s16(x_lo[3], cospi_5_64));
+  s_hi[3] = vsubq_s32(vmull_n_s16(x_hi[2], cospi_27_64),
+                      vmull_n_s16(x_hi[3], cospi_5_64));
+  // s4 = cospi_9_64 * x4 + cospi_23_64 * x5;
+  s_lo[4] = vaddq_s32(vmull_n_s16(x_lo[4], cospi_9_64),
+                      vmull_n_s16(x_lo[5], cospi_23_64));
+  s_hi[4] = vaddq_s32(vmull_n_s16(x_hi[4], cospi_9_64),
+                      vmull_n_s16(x_hi[5], cospi_23_64));
+  // s5 = cospi_23_64 * x4 - cospi_9_64 * x5;
+  s_lo[5] = vsubq_s32(vmull_n_s16(x_lo[4], cospi_23_64),
+                      vmull_n_s16(x_lo[5], cospi_9_64));
+  s_hi[5] = vsubq_s32(vmull_n_s16(x_hi[4], cospi_23_64),
+                      vmull_n_s16(x_hi[5], cospi_9_64));
+  // s6 = cospi_13_64 * x6 + cospi_19_64 * x7;
+  s_lo[6] = vaddq_s32(vmull_n_s16(x_lo[6], cospi_13_64),
+                      vmull_n_s16(x_lo[7], cospi_19_64));
+  s_hi[6] = vaddq_s32(vmull_n_s16(x_hi[6], cospi_13_64),
+                      vmull_n_s16(x_hi[7], cospi_19_64));
+  // s7 = cospi_19_64 * x6 - cospi_13_64 * x7;
+  s_lo[7] = vsubq_s32(vmull_n_s16(x_lo[6], cospi_19_64),
+                      vmull_n_s16(x_lo[7], cospi_13_64));
+  s_hi[7] = vsubq_s32(vmull_n_s16(x_hi[6], cospi_19_64),
+                      vmull_n_s16(x_hi[7], cospi_13_64));
+  // s8 = cospi_17_64 * x8 + cospi_15_64 * x9;
+  s_lo[8] = vaddq_s32(vmull_n_s16(x_lo[8], cospi_17_64),
+                      vmull_n_s16(x_lo[9], cospi_15_64));
+  s_hi[8] = vaddq_s32(vmull_n_s16(x_hi[8], cospi_17_64),
+                      vmull_n_s16(x_hi[9], cospi_15_64));
+  // s9 = cospi_15_64 * x8 - cospi_17_64 * x9;
+  s_lo[9] = vsubq_s32(vmull_n_s16(x_lo[8], cospi_15_64),
+                      vmull_n_s16(x_lo[9], cospi_17_64));
+  s_hi[9] = vsubq_s32(vmull_n_s16(x_hi[8], cospi_15_64),
+                      vmull_n_s16(x_hi[9], cospi_17_64));
+  // s10 = cospi_21_64 * x10 + cospi_11_64 * x11;
+  s_lo[10] = vaddq_s32(vmull_n_s16(x_lo[10], cospi_21_64),
+                       vmull_n_s16(x_lo[11], cospi_11_64));
+  s_hi[10] = vaddq_s32(vmull_n_s16(x_hi[10], cospi_21_64),
+                       vmull_n_s16(x_hi[11], cospi_11_64));
+  // s11 = cospi_11_64 * x10 - cospi_21_64 * x11;
+  s_lo[11] = vsubq_s32(vmull_n_s16(x_lo[10], cospi_11_64),
+                       vmull_n_s16(x_lo[11], cospi_21_64));
+  s_hi[11] = vsubq_s32(vmull_n_s16(x_hi[10], cospi_11_64),
+                       vmull_n_s16(x_hi[11], cospi_21_64));
+  // s12 = cospi_25_64 * x12 + cospi_7_64 * x13;
+  s_lo[12] = vaddq_s32(vmull_n_s16(x_lo[12], cospi_25_64),
+                       vmull_n_s16(x_lo[13], cospi_7_64));
+  s_hi[12] = vaddq_s32(vmull_n_s16(x_hi[12], cospi_25_64),
+                       vmull_n_s16(x_hi[13], cospi_7_64));
+  // s13 = cospi_7_64 * x12 - cospi_25_64 * x13;
+  s_lo[13] = vsubq_s32(vmull_n_s16(x_lo[12], cospi_7_64),
+                       vmull_n_s16(x_lo[13], cospi_25_64));
+  s_hi[13] = vsubq_s32(vmull_n_s16(x_hi[12], cospi_7_64),
+                       vmull_n_s16(x_hi[13], cospi_25_64));
+  // s14 = cospi_29_64 * x14 + cospi_3_64 * x15;
+  s_lo[14] = vaddq_s32(vmull_n_s16(x_lo[14], cospi_29_64),
+                       vmull_n_s16(x_lo[15], cospi_3_64));
+  s_hi[14] = vaddq_s32(vmull_n_s16(x_hi[14], cospi_29_64),
+                       vmull_n_s16(x_hi[15], cospi_3_64));
+  // s15 = cospi_3_64 * x14 - cospi_29_64 * x15;
+  s_lo[15] = vsubq_s32(vmull_n_s16(x_lo[14], cospi_3_64),
+                       vmull_n_s16(x_lo[15], cospi_29_64));
+  s_hi[15] = vsubq_s32(vmull_n_s16(x_hi[14], cospi_3_64),
+                       vmull_n_s16(x_hi[15], cospi_29_64));
+
+  // t[k] = s[k] +/- s[k + 8] (butterfly), followed by fdct_round_shift
+  t_lo[0] = vaddq_s32(s_lo[0], s_lo[8]);
+  t_hi[0] = vaddq_s32(s_hi[0], s_hi[8]);
+  t_lo[1] = vaddq_s32(s_lo[1], s_lo[9]);
+  t_hi[1] = vaddq_s32(s_hi[1], s_hi[9]);
+  t_lo[2] = vaddq_s32(s_lo[2], s_lo[10]);
+  t_hi[2] = vaddq_s32(s_hi[2], s_hi[10]);
+  t_lo[3] = vaddq_s32(s_lo[3], s_lo[11]);
+  t_hi[3] = vaddq_s32(s_hi[3], s_hi[11]);
+  t_lo[4] = vaddq_s32(s_lo[4], s_lo[12]);
+  t_hi[4] = vaddq_s32(s_hi[4], s_hi[12]);
+  t_lo[5] = vaddq_s32(s_lo[5], s_lo[13]);
+  t_hi[5] = vaddq_s32(s_hi[5], s_hi[13]);
+  t_lo[6] = vaddq_s32(s_lo[6], s_lo[14]);
+  t_hi[6] = vaddq_s32(s_hi[6], s_hi[14]);
+  t_lo[7] = vaddq_s32(s_lo[7], s_lo[15]);
+  t_hi[7] = vaddq_s32(s_hi[7], s_hi[15]);
+  t_lo[8] = vsubq_s32(s_lo[0], s_lo[8]);
+  t_hi[8] = vsubq_s32(s_hi[0], s_hi[8]);
+  t_lo[9] = vsubq_s32(s_lo[1], s_lo[9]);
+  t_hi[9] = vsubq_s32(s_hi[1], s_hi[9]);
+  t_lo[10] = vsubq_s32(s_lo[2], s_lo[10]);
+  t_hi[10] = vsubq_s32(s_hi[2], s_hi[10]);
+  t_lo[11] = vsubq_s32(s_lo[3], s_lo[11]);
+  t_hi[11] = vsubq_s32(s_hi[3], s_hi[11]);
+  t_lo[12] = vsubq_s32(s_lo[4], s_lo[12]);
+  t_hi[12] = vsubq_s32(s_hi[4], s_hi[12]);
+  t_lo[13] = vsubq_s32(s_lo[5], s_lo[13]);
+  t_hi[13] = vsubq_s32(s_hi[5], s_hi[13]);
+  t_lo[14] = vsubq_s32(s_lo[6], s_lo[14]);
+  t_hi[14] = vsubq_s32(s_hi[6], s_hi[14]);
+  t_lo[15] = vsubq_s32(s_lo[7], s_lo[15]);
+  t_hi[15] = vsubq_s32(s_hi[7], s_hi[15]);
+
+  t_lo[0] = vaddq_s32(t_lo[0], k__DCT_CONST_ROUNDING);
+  t_hi[0] = vaddq_s32(t_hi[0], k__DCT_CONST_ROUNDING);
+  t_lo[1] = vaddq_s32(t_lo[1], k__DCT_CONST_ROUNDING);
+  t_hi[1] = vaddq_s32(t_hi[1], k__DCT_CONST_ROUNDING);
+  t_lo[2] = vaddq_s32(t_lo[2], k__DCT_CONST_ROUNDING);
+  t_hi[2] = vaddq_s32(t_hi[2], k__DCT_CONST_ROUNDING);
+  t_lo[3] = vaddq_s32(t_lo[3], k__DCT_CONST_ROUNDING);
+  t_hi[3] = vaddq_s32(t_hi[3], k__DCT_CONST_ROUNDING);
+  t_lo[4] = vaddq_s32(t_lo[4], k__DCT_CONST_ROUNDING);
+  t_hi[4] = vaddq_s32(t_hi[4], k__DCT_CONST_ROUNDING);
+  t_lo[5] = vaddq_s32(t_lo[5], k__DCT_CONST_ROUNDING);
+  t_hi[5] = vaddq_s32(t_hi[5], k__DCT_CONST_ROUNDING);
+  t_lo[6] = vaddq_s32(t_lo[6], k__DCT_CONST_ROUNDING);
+  t_hi[6] = vaddq_s32(t_hi[6], k__DCT_CONST_ROUNDING);
+  t_lo[7] = vaddq_s32(t_lo[7], k__DCT_CONST_ROUNDING);
+  t_hi[7] = vaddq_s32(t_hi[7], k__DCT_CONST_ROUNDING);
+  t_lo[8] = vaddq_s32(t_lo[8], k__DCT_CONST_ROUNDING);
+  t_hi[8] = vaddq_s32(t_hi[8], k__DCT_CONST_ROUNDING);
+  t_lo[9] = vaddq_s32(t_lo[9], k__DCT_CONST_ROUNDING);
+  t_hi[9] = vaddq_s32(t_hi[9], k__DCT_CONST_ROUNDING);
+  t_lo[10] = vaddq_s32(t_lo[10], k__DCT_CONST_ROUNDING);
+  t_hi[10] = vaddq_s32(t_hi[10], k__DCT_CONST_ROUNDING);
+  t_lo[11] = vaddq_s32(t_lo[11], k__DCT_CONST_ROUNDING);
+  t_hi[11] = vaddq_s32(t_hi[11], k__DCT_CONST_ROUNDING);
+  t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING);
+  t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING);
+  t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING);
+  t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING);
+  t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING);
+  t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING);
+  t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING);
+  t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING);
+
+  t_lo[0] = vshrq_n_s32(t_lo[0], DCT_CONST_BITS);
+  t_hi[0] = vshrq_n_s32(t_hi[0], DCT_CONST_BITS);
+  t_lo[1] = vshrq_n_s32(t_lo[1], DCT_CONST_BITS);
+  t_hi[1] = vshrq_n_s32(t_hi[1], DCT_CONST_BITS);
+  t_lo[2] = vshrq_n_s32(t_lo[2], DCT_CONST_BITS);
+  t_hi[2] = vshrq_n_s32(t_hi[2], DCT_CONST_BITS);
+  t_lo[3] = vshrq_n_s32(t_lo[3], DCT_CONST_BITS);
+  t_hi[3] = vshrq_n_s32(t_hi[3], DCT_CONST_BITS);
+  t_lo[4] = vshrq_n_s32(t_lo[4], DCT_CONST_BITS);
+  t_hi[4] = vshrq_n_s32(t_hi[4], DCT_CONST_BITS);
+  t_lo[5] = vshrq_n_s32(t_lo[5], DCT_CONST_BITS);
+  t_hi[5] = vshrq_n_s32(t_hi[5], DCT_CONST_BITS);
+  t_lo[6] = vshrq_n_s32(t_lo[6], DCT_CONST_BITS);
+  t_hi[6] = vshrq_n_s32(t_hi[6], DCT_CONST_BITS);
+  t_lo[7] = vshrq_n_s32(t_lo[7], DCT_CONST_BITS);
+  t_hi[7] = vshrq_n_s32(t_hi[7], DCT_CONST_BITS);
+  t_lo[8] = vshrq_n_s32(t_lo[8], DCT_CONST_BITS);
+  t_hi[8] = vshrq_n_s32(t_hi[8], DCT_CONST_BITS);
+  t_lo[9] = vshrq_n_s32(t_lo[9], DCT_CONST_BITS);
+  t_hi[9] = vshrq_n_s32(t_hi[9], DCT_CONST_BITS);
+  t_lo[10] = vshrq_n_s32(t_lo[10], DCT_CONST_BITS);
+  t_hi[10] = vshrq_n_s32(t_hi[10], DCT_CONST_BITS);
+  t_lo[11] = vshrq_n_s32(t_lo[11], DCT_CONST_BITS);
+  t_hi[11] = vshrq_n_s32(t_hi[11], DCT_CONST_BITS);
+  t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS);
+  t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS);
+  t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS);
+  t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS);
+  t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS);
+  t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS);
+  t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS);
+  t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS);
+
+  // stage 2
+  s_lo[0] = t_lo[0];
+  s_hi[0] = t_hi[0];
+  s_lo[1] = t_lo[1];
+  s_hi[1] = t_hi[1];
+  s_lo[2] = t_lo[2];
+  s_hi[2] = t_hi[2];
+  s_lo[3] = t_lo[3];
+  s_hi[3] = t_hi[3];
+  s_lo[4] = t_lo[4];
+  s_hi[4] = t_hi[4];
+  s_lo[5] = t_lo[5];
+  s_hi[5] = t_hi[5];
+  s_lo[6] = t_lo[6];
+  s_hi[6] = t_hi[6];
+  s_lo[7] = t_lo[7];
+  s_hi[7] = t_hi[7];
+  // s8 = x8 * cospi_4_64 + x9 * cospi_28_64;
+  s_lo[8] = vaddq_s32(vmulq_n_s32(t_lo[8], cospi_4_64),
+                      vmulq_n_s32(t_lo[9], cospi_28_64));
+  s_hi[8] = vaddq_s32(vmulq_n_s32(t_hi[8], cospi_4_64),
+                      vmulq_n_s32(t_hi[9], cospi_28_64));
+  // s9 = x8 * cospi_28_64 - x9 * cospi_4_64;
+  s_lo[9] = vsubq_s32(vmulq_n_s32(t_lo[8], cospi_28_64),
+                      vmulq_n_s32(t_lo[9], cospi_4_64));
+  s_hi[9] = vsubq_s32(vmulq_n_s32(t_hi[8], cospi_28_64),
+                      vmulq_n_s32(t_hi[9], cospi_4_64));
+  // s10 = x10 * cospi_20_64 + x11 * cospi_12_64;
+  s_lo[10] = vaddq_s32(vmulq_n_s32(t_lo[10], cospi_20_64),
+                       vmulq_n_s32(t_lo[11], cospi_12_64));
+  s_hi[10] = vaddq_s32(vmulq_n_s32(t_hi[10], cospi_20_64),
+                       vmulq_n_s32(t_hi[11], cospi_12_64));
+  // s11 = x10 * cospi_12_64 - x11 * cospi_20_64;
+  s_lo[11] = vsubq_s32(vmulq_n_s32(t_lo[10], cospi_12_64),
+                       vmulq_n_s32(t_lo[11], cospi_20_64));
+  s_hi[11] = vsubq_s32(vmulq_n_s32(t_hi[10], cospi_12_64),
+                       vmulq_n_s32(t_hi[11], cospi_20_64));
+  // s12 = -x12 * cospi_28_64 + x13 * cospi_4_64;
+  s_lo[12] = vaddq_s32(vmulq_n_s32(t_lo[12], -cospi_28_64),
+                       vmulq_n_s32(t_lo[13], cospi_4_64));
+  s_hi[12] = vaddq_s32(vmulq_n_s32(t_hi[12], -cospi_28_64),
+                       vmulq_n_s32(t_hi[13], cospi_4_64));
+  // s13 = x12 * cospi_4_64 + x13 * cospi_28_64;
+  s_lo[13] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_4_64),
+                       vmulq_n_s32(t_lo[13], cospi_28_64));
+  s_hi[13] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_4_64),
+                       vmulq_n_s32(t_hi[13], cospi_28_64));
+  // s14 = -x14 * cospi_12_64 + x15 * cospi_20_64;
+  s_lo[14] = vaddq_s32(vmulq_n_s32(t_lo[14], -cospi_12_64),
+                       vmulq_n_s32(t_lo[15], cospi_20_64));
+  s_hi[14] = vaddq_s32(vmulq_n_s32(t_hi[14], -cospi_12_64),
+                       vmulq_n_s32(t_hi[15], cospi_20_64));
+  // s15 = x14 * cospi_20_64 + x15 * cospi_12_64;
+  s_lo[15] = vaddq_s32(vmulq_n_s32(t_lo[14], cospi_20_64),
+                       vmulq_n_s32(t_lo[15], cospi_12_64));
+  s_hi[15] = vaddq_s32(vmulq_n_s32(t_hi[14], cospi_20_64),
+                       vmulq_n_s32(t_hi[15], cospi_12_64));
+
+  // s0 + s4
+  t_lo[0] = vaddq_s32(s_lo[0], s_lo[4]);
+  t_hi[0] = vaddq_s32(s_hi[0], s_hi[4]);
+  // s1 + s5
+  t_lo[1] = vaddq_s32(s_lo[1], s_lo[5]);
+  t_hi[1] = vaddq_s32(s_hi[1], s_hi[5]);
+  // s2 + s6
+  t_lo[2] = vaddq_s32(s_lo[2], s_lo[6]);
+  t_hi[2] = vaddq_s32(s_hi[2], s_hi[6]);
+  // s3 + s7
+  t_lo[3] = vaddq_s32(s_lo[3], s_lo[7]);
+  t_hi[3] = vaddq_s32(s_hi[3], s_hi[7]);
+  // s0 - s4
+  t_lo[4] = vsubq_s32(s_lo[0], s_lo[4]);
+  t_hi[4] = vsubq_s32(s_hi[0], s_hi[4]);
+  // s1 - s5
+  t_lo[5] = vsubq_s32(s_lo[1], s_lo[5]);
+  t_hi[5] = vsubq_s32(s_hi[1], s_hi[5]);
+  // s2 - s6
+  t_lo[6] = vsubq_s32(s_lo[2], s_lo[6]);
+  t_hi[6] = vsubq_s32(s_hi[2], s_hi[6]);
+  // s3 - s7
+  t_lo[7] = vsubq_s32(s_lo[3], s_lo[7]);
+  t_hi[7] = vsubq_s32(s_hi[3], s_hi[7]);
+  // s8 + s12
+  t_lo[8] = vaddq_s32(s_lo[8], s_lo[12]);
+  t_hi[8] = vaddq_s32(s_hi[8], s_hi[12]);
+  // s9 + s13
+  t_lo[9] = vaddq_s32(s_lo[9], s_lo[13]);
+  t_hi[9] = vaddq_s32(s_hi[9], s_hi[13]);
+  // s10 + s14
+  t_lo[10] = vaddq_s32(s_lo[10], s_lo[14]);
+  t_hi[10] = vaddq_s32(s_hi[10], s_hi[14]);
+  // s11 + s15
+  t_lo[11] = vaddq_s32(s_lo[11], s_lo[15]);
+  t_hi[11] = vaddq_s32(s_hi[11], s_hi[15]);
+  // s8 - s12
+  t_lo[12] = vsubq_s32(s_lo[8], s_lo[12]);
+  t_hi[12] = vsubq_s32(s_hi[8], s_hi[12]);
+  // s9 - s13
+  t_lo[13] = vsubq_s32(s_lo[9], s_lo[13]);
+  t_hi[13] = vsubq_s32(s_hi[9], s_hi[13]);
+  // s10 - s14
+  t_lo[14] = vsubq_s32(s_lo[10], s_lo[14]);
+  t_hi[14] = vsubq_s32(s_hi[10], s_hi[14]);
+  // s11 - s15
+  t_lo[15] = vsubq_s32(s_lo[11], s_lo[15]);
+  t_hi[15] = vsubq_s32(s_hi[11], s_hi[15]);
+
+  t_lo[8] = vaddq_s32(t_lo[8], k__DCT_CONST_ROUNDING);
+  t_hi[8] = vaddq_s32(t_hi[8], k__DCT_CONST_ROUNDING);
+  t_lo[9] = vaddq_s32(t_lo[9], k__DCT_CONST_ROUNDING);
+  t_hi[9] = vaddq_s32(t_hi[9], k__DCT_CONST_ROUNDING);
+  t_lo[10] = vaddq_s32(t_lo[10], k__DCT_CONST_ROUNDING);
+  t_hi[10] = vaddq_s32(t_hi[10], k__DCT_CONST_ROUNDING);
+  t_lo[11] = vaddq_s32(t_lo[11], k__DCT_CONST_ROUNDING);
+  t_hi[11] = vaddq_s32(t_hi[11], k__DCT_CONST_ROUNDING);
+  t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING);
+  t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING);
+  t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING);
+  t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING);
+  t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING);
+  t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING);
+  t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING);
+  t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING);
+  t_lo[8] = vshrq_n_s32(t_lo[8], DCT_CONST_BITS);
+  t_hi[8] = vshrq_n_s32(t_hi[8], DCT_CONST_BITS);
+  t_lo[9] = vshrq_n_s32(t_lo[9], DCT_CONST_BITS);
+  t_hi[9] = vshrq_n_s32(t_hi[9], DCT_CONST_BITS);
+  t_lo[10] = vshrq_n_s32(t_lo[10], DCT_CONST_BITS);
+  t_hi[10] = vshrq_n_s32(t_hi[10], DCT_CONST_BITS);
+  t_lo[11] = vshrq_n_s32(t_lo[11], DCT_CONST_BITS);
+  t_hi[11] = vshrq_n_s32(t_hi[11], DCT_CONST_BITS);
+  t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS);
+  t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS);
+  t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS);
+  t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS);
+  t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS);
+  t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS);
+  t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS);
+  t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS);
+
+  // stage 3
+  s_lo[0] = t_lo[0];
+  s_hi[0] = t_hi[0];
+  s_lo[1] = t_lo[1];
+  s_hi[1] = t_hi[1];
+  s_lo[2] = t_lo[2];
+  s_hi[2] = t_hi[2];
+  s_lo[3] = t_lo[3];
+  s_hi[3] = t_hi[3];
+  // s4 = x4 * cospi_8_64 + x5 * cospi_24_64;
+  s_lo[4] = vaddq_s32(vmulq_n_s32(t_lo[4], cospi_8_64),
+                      vmulq_n_s32(t_lo[5], cospi_24_64));
+  s_hi[4] = vaddq_s32(vmulq_n_s32(t_hi[4], cospi_8_64),
+                      vmulq_n_s32(t_hi[5], cospi_24_64));
+  // s5 = x4 * cospi_24_64 - x5 * cospi_8_64;
+  s_lo[5] = vaddq_s32(vmulq_n_s32(t_lo[4], cospi_24_64),
+                      vmulq_n_s32(t_lo[5], -cospi_8_64));
+  s_hi[5] = vaddq_s32(vmulq_n_s32(t_hi[4], cospi_24_64),
+                      vmulq_n_s32(t_hi[5], -cospi_8_64));
+  // s6 = -x6 * cospi_24_64 + x7 * cospi_8_64;
+  s_lo[6] = vaddq_s32(vmulq_n_s32(t_lo[6], -cospi_24_64),
+                      vmulq_n_s32(t_lo[7], cospi_8_64));
+  s_hi[6] = vaddq_s32(vmulq_n_s32(t_hi[6], -cospi_24_64),
+                      vmulq_n_s32(t_hi[7], cospi_8_64));
+  // s7 = x6 * cospi_8_64 + x7 * cospi_24_64;
+  s_lo[7] = vaddq_s32(vmulq_n_s32(t_lo[6], cospi_8_64),
+                      vmulq_n_s32(t_lo[7], cospi_24_64));
+  s_hi[7] = vaddq_s32(vmulq_n_s32(t_hi[6], cospi_8_64),
+                      vmulq_n_s32(t_hi[7], cospi_24_64));
+  s_lo[8] = t_lo[8];
+  s_hi[8] = t_hi[8];
+  s_lo[9] = t_lo[9];
+  s_hi[9] = t_hi[9];
+  s_lo[10] = t_lo[10];
+  s_hi[10] = t_hi[10];
+  s_lo[11] = t_lo[11];
+  s_hi[11] = t_hi[11];
+  // s12 = x12 * cospi_8_64 + x13 * cospi_24_64;
+  s_lo[12] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_8_64),
+                       vmulq_n_s32(t_lo[13], cospi_24_64));
+  s_hi[12] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_8_64),
+                       vmulq_n_s32(t_hi[13], cospi_24_64));
+  // s13 = x12 * cospi_24_64 - x13 * cospi_8_64;
+  s_lo[13] = vaddq_s32(vmulq_n_s32(t_lo[12], cospi_24_64),
+                       vmulq_n_s32(t_lo[13], -cospi_8_64));
+  s_hi[13] = vaddq_s32(vmulq_n_s32(t_hi[12], cospi_24_64),
+                       vmulq_n_s32(t_hi[13], -cospi_8_64));
+  // s14 = -x14 * cospi_24_64 + x15 * cospi_8_64;
+  s_lo[14] = vaddq_s32(vmulq_n_s32(t_lo[14], -cospi_24_64),
+                       vmulq_n_s32(t_lo[15], cospi_8_64));
+  s_hi[14] = vaddq_s32(vmulq_n_s32(t_hi[14], -cospi_24_64),
+                       vmulq_n_s32(t_hi[15], cospi_8_64));
+  // s15 = x14 * cospi_8_64 + x15 * cospi_24_64;
+  s_lo[15] = vaddq_s32(vmulq_n_s32(t_lo[14], cospi_8_64),
+                       vmulq_n_s32(t_lo[15], cospi_24_64));
+  s_hi[15] = vaddq_s32(vmulq_n_s32(t_hi[14], cospi_8_64),
+                       vmulq_n_s32(t_hi[15], cospi_24_64));
+
+  // s0 + s2
+  t_lo[0] = vaddq_s32(s_lo[0], s_lo[2]);
+  t_hi[0] = vaddq_s32(s_hi[0], s_hi[2]);
+  // s1 + s3
+  t_lo[1] = vaddq_s32(s_lo[1], s_lo[3]);
+  t_hi[1] = vaddq_s32(s_hi[1], s_hi[3]);
+  // s0 - s2
+  t_lo[2] = vsubq_s32(s_lo[0], s_lo[2]);
+  t_hi[2] = vsubq_s32(s_hi[0], s_hi[2]);
+  // s1 - s3
+  t_lo[3] = vsubq_s32(s_lo[1], s_lo[3]);
+  t_hi[3] = vsubq_s32(s_hi[1], s_hi[3]);
+  // s4 + s6
+  t_lo[4] = vaddq_s32(s_lo[4], s_lo[6]);
+  t_hi[4] = vaddq_s32(s_hi[4], s_hi[6]);
+  // s5 + s7
+  t_lo[5] = vaddq_s32(s_lo[5], s_lo[7]);
+  t_hi[5] = vaddq_s32(s_hi[5], s_hi[7]);
+  // s4 - s6
+  t_lo[6] = vsubq_s32(s_lo[4], s_lo[6]);
+  t_hi[6] = vsubq_s32(s_hi[4], s_hi[6]);
+  // s5 - s7
+  t_lo[7] = vsubq_s32(s_lo[5], s_lo[7]);
+  t_hi[7] = vsubq_s32(s_hi[5], s_hi[7]);
+  // s8 + s10
+  t_lo[8] = vaddq_s32(s_lo[8], s_lo[10]);
+  t_hi[8] = vaddq_s32(s_hi[8], s_hi[10]);
+  // s9 + s11
+  t_lo[9] = vaddq_s32(s_lo[9], s_lo[11]);
+  t_hi[9] = vaddq_s32(s_hi[9], s_hi[11]);
+  // s8 - s10
+  t_lo[10] = vsubq_s32(s_lo[8], s_lo[10]);
+  t_hi[10] = vsubq_s32(s_hi[8], s_hi[10]);
+  // s9 - s11
+  t_lo[11] = vsubq_s32(s_lo[9], s_lo[11]);
+  t_hi[11] = vsubq_s32(s_hi[9], s_hi[11]);
+  // s12 + s14
+  t_lo[12] = vaddq_s32(s_lo[12], s_lo[14]);
+  t_hi[12] = vaddq_s32(s_hi[12], s_hi[14]);
+  // s13 + s15
+  t_lo[13] = vaddq_s32(s_lo[13], s_lo[15]);
+  t_hi[13] = vaddq_s32(s_hi[13], s_hi[15]);
+  // s12 - s14
+  t_lo[14] = vsubq_s32(s_lo[12], s_lo[14]);
+  t_hi[14] = vsubq_s32(s_hi[12], s_hi[14]);
+  // s13 - s15
+  t_lo[15] = vsubq_s32(s_lo[13], s_lo[15]);
+  t_hi[15] = vsubq_s32(s_hi[13], s_hi[15]);
+
+  t_lo[4] = vaddq_s32(t_lo[4], k__DCT_CONST_ROUNDING);
+  t_hi[4] = vaddq_s32(t_hi[4], k__DCT_CONST_ROUNDING);
+  t_lo[5] = vaddq_s32(t_lo[5], k__DCT_CONST_ROUNDING);
+  t_hi[5] = vaddq_s32(t_hi[5], k__DCT_CONST_ROUNDING);
+  t_lo[6] = vaddq_s32(t_lo[6], k__DCT_CONST_ROUNDING);
+  t_hi[6] = vaddq_s32(t_hi[6], k__DCT_CONST_ROUNDING);
+  t_lo[7] = vaddq_s32(t_lo[7], k__DCT_CONST_ROUNDING);
+  t_hi[7] = vaddq_s32(t_hi[7], k__DCT_CONST_ROUNDING);
+  t_lo[12] = vaddq_s32(t_lo[12], k__DCT_CONST_ROUNDING);
+  t_hi[12] = vaddq_s32(t_hi[12], k__DCT_CONST_ROUNDING);
+  t_lo[13] = vaddq_s32(t_lo[13], k__DCT_CONST_ROUNDING);
+  t_hi[13] = vaddq_s32(t_hi[13], k__DCT_CONST_ROUNDING);
+  t_lo[14] = vaddq_s32(t_lo[14], k__DCT_CONST_ROUNDING);
+  t_hi[14] = vaddq_s32(t_hi[14], k__DCT_CONST_ROUNDING);
+  t_lo[15] = vaddq_s32(t_lo[15], k__DCT_CONST_ROUNDING);
+  t_hi[15] = vaddq_s32(t_hi[15], k__DCT_CONST_ROUNDING);
+  t_lo[4] = vshrq_n_s32(t_lo[4], DCT_CONST_BITS);
+  t_hi[4] = vshrq_n_s32(t_hi[4], DCT_CONST_BITS);
+  t_lo[5] = vshrq_n_s32(t_lo[5], DCT_CONST_BITS);
+  t_hi[5] = vshrq_n_s32(t_hi[5], DCT_CONST_BITS);
+  t_lo[6] = vshrq_n_s32(t_lo[6], DCT_CONST_BITS);
+  t_hi[6] = vshrq_n_s32(t_hi[6], DCT_CONST_BITS);
+  t_lo[7] = vshrq_n_s32(t_lo[7], DCT_CONST_BITS);
+  t_hi[7] = vshrq_n_s32(t_hi[7], DCT_CONST_BITS);
+  t_lo[12] = vshrq_n_s32(t_lo[12], DCT_CONST_BITS);
+  t_hi[12] = vshrq_n_s32(t_hi[12], DCT_CONST_BITS);
+  t_lo[13] = vshrq_n_s32(t_lo[13], DCT_CONST_BITS);
+  t_hi[13] = vshrq_n_s32(t_hi[13], DCT_CONST_BITS);
+  t_lo[14] = vshrq_n_s32(t_lo[14], DCT_CONST_BITS);
+  t_hi[14] = vshrq_n_s32(t_hi[14], DCT_CONST_BITS);
+  t_lo[15] = vshrq_n_s32(t_lo[15], DCT_CONST_BITS);
+  t_hi[15] = vshrq_n_s32(t_hi[15], DCT_CONST_BITS);
+
+  // stage 4
+  // s2 = (-cospi_16_64) * (x2 + x3);
+  s_lo[2] = vmulq_n_s32(vaddq_s32(t_lo[2], t_lo[3]), -cospi_16_64);
+  s_hi[2] = vmulq_n_s32(vaddq_s32(t_hi[2], t_hi[3]), -cospi_16_64);
+  // s3 = cospi_16_64 * (x2 - x3);
+  s_lo[3] = vmulq_n_s32(vsubq_s32(t_lo[2], t_lo[3]), cospi_16_64);
+  s_hi[3] = vmulq_n_s32(vsubq_s32(t_hi[2], t_hi[3]), cospi_16_64);
+  // s6 = cospi_16_64 * (x6 + x7);
+  s_lo[6] = vmulq_n_s32(vaddq_s32(t_lo[6], t_lo[7]), cospi_16_64);
+  s_hi[6] = vmulq_n_s32(vaddq_s32(t_hi[6], t_hi[7]), cospi_16_64);
+  // s7 = cospi_16_64 * (-x6 + x7);
+  s_lo[7] = vmulq_n_s32(vsubq_s32(t_lo[7], t_lo[6]), cospi_16_64);
+  s_hi[7] = vmulq_n_s32(vsubq_s32(t_hi[7], t_hi[6]), cospi_16_64);
+  // s10 = cospi_16_64 * (x10 + x11);
+  s_lo[10] = vmulq_n_s32(vaddq_s32(t_lo[10], t_lo[11]), cospi_16_64);
+  s_hi[10] = vmulq_n_s32(vaddq_s32(t_hi[10], t_hi[11]), cospi_16_64);
+  // s11 = cospi_16_64 * (-x10 + x11);
+  s_lo[11] = vmulq_n_s32(vsubq_s32(t_lo[11], t_lo[10]), cospi_16_64);
+  s_hi[11] = vmulq_n_s32(vsubq_s32(t_hi[11], t_hi[10]), cospi_16_64);
+  // s14 = (-cospi_16_64) * (x14 + x15);
+  s_lo[14] = vmulq_n_s32(vaddq_s32(t_lo[14], t_lo[15]), -cospi_16_64);
+  s_hi[14] = vmulq_n_s32(vaddq_s32(t_hi[14], t_hi[15]), -cospi_16_64);
+  // s15 = cospi_16_64 * (x14 - x15);
+  s_lo[15] = vmulq_n_s32(vsubq_s32(t_lo[14], t_lo[15]), cospi_16_64);
+  s_hi[15] = vmulq_n_s32(vsubq_s32(t_hi[14], t_hi[15]), cospi_16_64);
+
+  // final fdct_round_shift
+  t_lo[2] = vaddq_s32(s_lo[2], k__DCT_CONST_ROUNDING);
+  t_hi[2] = vaddq_s32(s_hi[2], k__DCT_CONST_ROUNDING);
+  t_lo[3] = vaddq_s32(s_lo[3], k__DCT_CONST_ROUNDING);
+  t_hi[3] = vaddq_s32(s_hi[3], k__DCT_CONST_ROUNDING);
+  t_lo[6] = vaddq_s32(s_lo[6], k__DCT_CONST_ROUNDING);
+  t_hi[6] = vaddq_s32(s_hi[6], k__DCT_CONST_ROUNDING);
+  t_lo[7] = vaddq_s32(s_lo[7], k__DCT_CONST_ROUNDING);
+  t_hi[7] = vaddq_s32(s_hi[7], k__DCT_CONST_ROUNDING);
+  t_lo[10] = vaddq_s32(s_lo[10], k__DCT_CONST_ROUNDING);
+  t_hi[10] = vaddq_s32(s_hi[10], k__DCT_CONST_ROUNDING);
+  t_lo[11] = vaddq_s32(s_lo[11], k__DCT_CONST_ROUNDING);
+  t_hi[11] = vaddq_s32(s_hi[11], k__DCT_CONST_ROUNDING);
+  t_lo[14] = vaddq_s32(s_lo[14], k__DCT_CONST_ROUNDING);
+  t_hi[14] = vaddq_s32(s_hi[14], k__DCT_CONST_ROUNDING);
+  t_lo[15] = vaddq_s32(s_lo[15], k__DCT_CONST_ROUNDING);
+  t_hi[15] = vaddq_s32(s_hi[15], k__DCT_CONST_ROUNDING);
+
+  x_lo[2] = vshrn_n_s32(t_lo[2], DCT_CONST_BITS);
+  x_hi[2] = vshrn_n_s32(t_hi[2], DCT_CONST_BITS);
+  x_lo[3] = vshrn_n_s32(t_lo[3], DCT_CONST_BITS);
+  x_hi[3] = vshrn_n_s32(t_hi[3], DCT_CONST_BITS);
+  x_lo[6] = vshrn_n_s32(t_lo[6], DCT_CONST_BITS);
+  x_hi[6] = vshrn_n_s32(t_hi[6], DCT_CONST_BITS);
+  x_lo[7] = vshrn_n_s32(t_lo[7], DCT_CONST_BITS);
+  x_hi[7] = vshrn_n_s32(t_hi[7], DCT_CONST_BITS);
+  x_lo[10] = vshrn_n_s32(t_lo[10], DCT_CONST_BITS);
+  x_hi[10] = vshrn_n_s32(t_hi[10], DCT_CONST_BITS);
+  x_lo[11] = vshrn_n_s32(t_lo[11], DCT_CONST_BITS);
+  x_hi[11] = vshrn_n_s32(t_hi[11], DCT_CONST_BITS);
+  x_lo[14] = vshrn_n_s32(t_lo[14], DCT_CONST_BITS);
+  x_hi[14] = vshrn_n_s32(t_hi[14], DCT_CONST_BITS);
+  x_lo[15] = vshrn_n_s32(t_lo[15], DCT_CONST_BITS);
+  x_hi[15] = vshrn_n_s32(t_hi[15], DCT_CONST_BITS);
+
+  // x0, x1, x4, x5, x8, x9, x12, x13 narrow to 16 bits directly
+  x_lo[0] = vmovn_s32(t_lo[0]);
+  x_hi[0] = vmovn_s32(t_hi[0]);
+  x_lo[1] = vmovn_s32(t_lo[1]);
+  x_hi[1] = vmovn_s32(t_hi[1]);
+  x_lo[4] = vmovn_s32(t_lo[4]);
+  x_hi[4] = vmovn_s32(t_hi[4]);
+  x_lo[5] = vmovn_s32(t_lo[5]);
+  x_hi[5] = vmovn_s32(t_hi[5]);
+  x_lo[8] = vmovn_s32(t_lo[8]);
+  x_hi[8] = vmovn_s32(t_hi[8]);
+  x_lo[9] = vmovn_s32(t_lo[9]);
+  x_hi[9] = vmovn_s32(t_hi[9]);
+  x_lo[12] = vmovn_s32(t_lo[12]);
+  x_hi[12] = vmovn_s32(t_hi[12]);
+  x_lo[13] = vmovn_s32(t_lo[13]);
+  x_hi[13] = vmovn_s32(t_hi[13]);
+
+  in[0] = vcombine_s16(x_lo[0], x_hi[0]);
+  in[1] = vnegq_s16(vcombine_s16(x_lo[8], x_hi[8]));
+  in[2] = vcombine_s16(x_lo[12], x_hi[12]);
+  in[3] = vnegq_s16(vcombine_s16(x_lo[4], x_hi[4]));
+  in[4] = vcombine_s16(x_lo[6], x_hi[6]);
+  in[5] = vcombine_s16(x_lo[14], x_hi[14]);
+  in[6] = vcombine_s16(x_lo[10], x_hi[10]);
+  in[7] = vcombine_s16(x_lo[2], x_hi[2]);
+  in[8] = vcombine_s16(x_lo[3], x_hi[3]);
+  in[9] = vcombine_s16(x_lo[11], x_hi[11]);
+  in[10] = vcombine_s16(x_lo[15], x_hi[15]);
+  in[11] = vcombine_s16(x_lo[7], x_hi[7]);
+  in[12] = vcombine_s16(x_lo[5], x_hi[5]);
+  in[13] = vnegq_s16(vcombine_s16(x_lo[13], x_hi[13]));
+  in[14] = vcombine_s16(x_lo[9], x_hi[9]);
+  in[15] = vnegq_s16(vcombine_s16(x_lo[1], x_hi[1]));
+}
+
+static void fdct16x16_neon(int16x8_t *in0, int16x8_t *in1) {
+  // Left half.
+  fdct16_8col(in0);
+  // Right half.
+  fdct16_8col(in1);
+  transpose_s16_16x16(in0, in1);
+}
+
+static void fadst16x16_neon(int16x8_t *in0, int16x8_t *in1) {
+  fadst16_8col(in0);
+  fadst16_8col(in1);
+  transpose_s16_16x16(in0, in1);
+}
+
+void vp9_fht16x16_neon(const int16_t *input, tran_low_t *output, int stride,
+                       int tx_type) {
+  int16x8_t in0[16], in1[16];
+
+  switch (tx_type) {
+    case DCT_DCT: vpx_fdct16x16_neon(input, output, stride); break;
+    case ADST_DCT:
+      load_buffer_16x16(input, in0, in1, stride);
+      fadst16x16_neon(in0, in1);
+      right_shift_16x16(in0, in1);
+      fdct16x16_neon(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    case DCT_ADST:
+      load_buffer_16x16(input, in0, in1, stride);
+      fdct16x16_neon(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16x16_neon(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+    default:
+      assert(tx_type == ADST_ADST);
+      load_buffer_16x16(input, in0, in1, stride);
+      fadst16x16_neon(in0, in1);
+      right_shift_16x16(in0, in1);
+      fadst16x16_neon(in0, in1);
+      write_buffer_16x16(output, in0, in1, 16);
+      break;
+  }
+}
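
Note: the new kernels in vp9_dct_neon.c are built from fixed-point rotation
butterflies. Each cospi_N_64 constant is round(cos(N * pi / 64) * 2^14)
(e.g. cospi_8_64 == 15137), and intermediate products are brought back into
16-bit range with fdct_round_shift. The 16x16 transforms run as two 8-column
halves followed by a 16x16 transpose, and vp9_fht16x16_neon composes a row
pass, right_shift_16x16, and a column pass. A minimal scalar sketch of the
two primitives, assuming DCT_CONST_BITS == 14 as in vpx_dsp/txfm_common.h
(the *_sketch names are illustrative, not helpers from this patch):

    static int32_t fdct_round_shift_sketch(int32_t x) {
      // Round to nearest, then drop the 14 fixed-point fraction bits.
      return (x + (1 << 13)) >> 14;
    }

    static void butterfly_sketch(int32_t x0, int32_t x1, int32_t c0,
                                 int32_t c1, int32_t *out0, int32_t *out1) {
      // The rotation pattern from the comments above, e.g. with
      // c0 = cospi_8_64 and c1 = cospi_24_64:
      //   s12 = x12 * cospi_8_64  + x13 * cospi_24_64
      //   s13 = x12 * cospi_24_64 - x13 * cospi_8_64
      *out0 = fdct_round_shift_sketch(x0 * c0 + x1 * c1);
      *out1 = fdct_round_shift_sketch(x0 * c1 - x1 * c0);
    }
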
diff --git a/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c b/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
index d75a481..236c317 100644
--- a/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
+++ b/libvpx/vp9/encoder/arm/neon/vp9_quantize_neon.c
@@ -43,11 +43,10 @@
 }
 
 void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count,
-                          int skip_block, const int16_t *round_ptr,
-                          const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                          uint16_t *eob_ptr, const int16_t *scan,
-                          const int16_t *iscan) {
+                          const int16_t *round_ptr, const int16_t *quant_ptr,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan, const int16_t *iscan) {
   // Quantization pass: All coefficients with index >= zero_flag are
   // skippable. Note: zero_flag can be zero.
   int i;
@@ -59,8 +58,6 @@
   int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]);
 
   (void)scan;
-  (void)skip_block;
-  assert(!skip_block);
 
   // adjust for dc
   v_round = vsetq_lane_s16(round_ptr[0], v_round, 0);
@@ -138,7 +135,7 @@
 }
 
 void vp9_quantize_fp_32x32_neon(const tran_low_t *coeff_ptr, intptr_t count,
-                                int skip_block, const int16_t *round_ptr,
+                                const int16_t *round_ptr,
                                 const int16_t *quant_ptr,
                                 tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                                 const int16_t *dequant_ptr, uint16_t *eob_ptr,
@@ -167,8 +164,6 @@
   uint16x8_t eob_max;
   (void)scan;
   (void)count;
-  (void)skip_block;
-  assert(!skip_block);
 
   // (coeff * quant_ptr[]) >> 15
   qcoeff = vqdmulhq_s16(qcoeff, quant);
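
Note: upstream dropped the unused skip_block argument from the
vp9_quantize_fp family; callers never passed a nonzero value, and the old
bodies merely asserted !skip_block before ignoring it. Call sites shrink by
one argument, as in this now-typical invocation from vp9_encodemb.c later in
this patch:

    vp9_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, dqcoeff,
                    pd->dequant, eob, scan_order->scan, scan_order->iscan);

The vp9_quantize_vsx.c hunk below applies the same signature change to the
PowerPC path.
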
diff --git a/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c b/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c
index 4f88b8f..4d31558 100644
--- a/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c
+++ b/libvpx/vp9/encoder/ppc/vp9_quantize_vsx.c
@@ -39,11 +39,10 @@
 }
 
 void vp9_quantize_fp_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                         int skip_block, const int16_t *round_ptr,
-                         const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                         uint16_t *eob_ptr, const int16_t *scan,
-                         const int16_t *iscan) {
+                         const int16_t *round_ptr, const int16_t *quant_ptr,
+                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                         const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                         const int16_t *scan, const int16_t *iscan) {
   int16x8_t qcoeff0, qcoeff1, dqcoeff0, dqcoeff1, eob;
   bool16x8_t zero_coeff0, zero_coeff1;
 
@@ -56,8 +55,6 @@
   int16x8_t scan1 = vec_vsx_ld(16, iscan);
 
   (void)scan;
-  (void)skip_block;
-  assert(!skip_block);
 
   // First set of 8 coeff starts with DC + 7 AC
   qcoeff0 = vec_mulhi(vec_vaddshs(vec_abs(coeff0), round), quant);
@@ -165,7 +162,7 @@
 }
 
 void vp9_quantize_fp_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                               int skip_block, const int16_t *round_ptr,
+                               const int16_t *round_ptr,
                                const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
                                tran_low_t *dqcoeff_ptr,
                                const int16_t *dequant_ptr, uint16_t *eob_ptr,
@@ -194,9 +191,7 @@
   int16x8_t abs_coeff1 = vec_abs(coeff1);
 
   (void)scan;
-  (void)skip_block;
   (void)n_coeffs;
-  assert(!skip_block);
 
   mask0 = vec_cmpge(abs_coeff0, thres);
   round = vec_sra(vec_add(round, vec_ones_s16), vec_ones_u16);
diff --git a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
index f06fe47..e336179 100644
--- a/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
+++ b/libvpx/vp9/encoder/vp9_aq_cyclicrefresh.c
@@ -497,7 +497,9 @@
        rc->avg_frame_low_motion < thresh_low_motion &&
        rc->frames_since_key > 40) ||
       (!cpi->use_svc && rc->avg_frame_qindex[INTER_FRAME] > qp_max_thresh &&
-       rc->frames_since_key > 20)) {
+       rc->frames_since_key > 20) ||
+      (cpi->roi.enabled && cpi->roi.skip[BACKGROUND_SEG_SKIP_ID] &&
+       rc->frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY)) {
     cr->apply_cyclic_refresh = 0;
     return;
   }
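
Note: the new clause disables cyclic refresh when an ROI background-skip
segment is active and enough frames have passed since the last key frame.
Restated as a standalone predicate (a sketch; roi_skip_disables_cr is an
illustrative name, not part of the patch):

    static int roi_skip_disables_cr(const VP9_COMP *cpi) {
      return cpi->roi.enabled && cpi->roi.skip[BACKGROUND_SEG_SKIP_ID] &&
             cpi->rc.frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY;
    }
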
diff --git a/libvpx/vp9/encoder/vp9_bitstream.c b/libvpx/vp9/encoder/vp9_bitstream.c
index 3eff4ce..75bd097 100644
--- a/libvpx/vp9/encoder/vp9_bitstream.c
+++ b/libvpx/vp9/encoder/vp9_bitstream.c
@@ -236,11 +236,11 @@
   }
 }
 
-static void pack_inter_mode_mvs(
-    VP9_COMP *cpi, const MACROBLOCKD *const xd,
-    const MB_MODE_INFO_EXT *const mbmi_ext, vpx_writer *w,
-    unsigned int *const max_mv_magnitude,
-    int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) {
+static void pack_inter_mode_mvs(VP9_COMP *cpi, const MACROBLOCKD *const xd,
+                                const MB_MODE_INFO_EXT *const mbmi_ext,
+                                vpx_writer *w,
+                                unsigned int *const max_mv_magnitude,
+                                int interp_filter_selected[][SWITCHABLE]) {
   VP9_COMMON *const cm = &cpi->common;
   const nmv_context *nmvc = &cm->fc->nmvc;
   const struct segmentation *const seg = &cm->seg;
@@ -373,11 +373,12 @@
   write_intra_mode(w, mi->uv_mode, vp9_kf_uv_mode_prob[mi->mode]);
 }
 
-static void write_modes_b(
-    VP9_COMP *cpi, MACROBLOCKD *const xd, const TileInfo *const tile,
-    vpx_writer *w, TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
-    int mi_row, int mi_col, unsigned int *const max_mv_magnitude,
-    int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) {
+static void write_modes_b(VP9_COMP *cpi, MACROBLOCKD *const xd,
+                          const TileInfo *const tile, vpx_writer *w,
+                          TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
+                          int mi_row, int mi_col,
+                          unsigned int *const max_mv_magnitude,
+                          int interp_filter_selected[][SWITCHABLE]) {
   const VP9_COMMON *const cm = &cpi->common;
   const MB_MODE_INFO_EXT *const mbmi_ext =
       cpi->td.mb.mbmi_ext_base + (mi_row * cm->mi_cols + mi_col);
@@ -422,12 +423,12 @@
   }
 }
 
-static void write_modes_sb(
-    VP9_COMP *cpi, MACROBLOCKD *const xd, const TileInfo *const tile,
-    vpx_writer *w, TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
-    int mi_row, int mi_col, BLOCK_SIZE bsize,
-    unsigned int *const max_mv_magnitude,
-    int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) {
+static void write_modes_sb(VP9_COMP *cpi, MACROBLOCKD *const xd,
+                           const TileInfo *const tile, vpx_writer *w,
+                           TOKENEXTRA **tok, const TOKENEXTRA *const tok_end,
+                           int mi_row, int mi_col, BLOCK_SIZE bsize,
+                           unsigned int *const max_mv_magnitude,
+                           int interp_filter_selected[][SWITCHABLE]) {
   const VP9_COMMON *const cm = &cpi->common;
   const int bsl = b_width_log2_lookup[bsize];
   const int bs = (1 << bsl) / 4;
@@ -485,11 +486,10 @@
     update_partition_context(xd, mi_row, mi_col, subsize, bsize);
 }
 
-static void write_modes(
-    VP9_COMP *cpi, MACROBLOCKD *const xd, const TileInfo *const tile,
-    vpx_writer *w, int tile_row, int tile_col,
-    unsigned int *const max_mv_magnitude,
-    int interp_filter_selected[MAX_REF_FRAMES][SWITCHABLE]) {
+static void write_modes(VP9_COMP *cpi, MACROBLOCKD *const xd,
+                        const TileInfo *const tile, vpx_writer *w, int tile_row,
+                        int tile_col, unsigned int *const max_mv_magnitude,
+                        int interp_filter_selected[][SWITCHABLE]) {
   const VP9_COMMON *const cm = &cpi->common;
   int mi_row, mi_col, tile_sb_row;
   TOKENEXTRA *tok = NULL;
@@ -554,7 +554,7 @@
   switch (cpi->sf.use_fast_coef_updates) {
     case TWO_LOOP: {
       /* dry run to see if there is any update at all needed */
-      int savings = 0;
+      int64_t savings = 0;
       int update[2] = { 0, 0 };
       for (i = 0; i < PLANE_TYPES; ++i) {
         for (j = 0; j < REF_TYPES; ++j) {
@@ -563,7 +563,7 @@
               for (t = 0; t < entropy_nodes_update; ++t) {
                 vpx_prob newp = new_coef_probs[i][j][k][l][t];
                 const vpx_prob oldp = old_coef_probs[i][j][k][l][t];
-                int s;
+                int64_t s;
                 int u = 0;
                 if (t == PIVOT_NODE)
                   s = vp9_prob_diff_update_savings_search_model(
@@ -600,7 +600,7 @@
                 vpx_prob newp = new_coef_probs[i][j][k][l][t];
                 vpx_prob *oldp = old_coef_probs[i][j][k][l] + t;
                 const vpx_prob upd = DIFF_UPDATE_PROB;
-                int s;
+                int64_t s;
                 int u = 0;
                 if (t == PIVOT_NODE)
                   s = vp9_prob_diff_update_savings_search_model(
@@ -636,7 +636,7 @@
               for (t = 0; t < entropy_nodes_update; ++t) {
                 vpx_prob newp = new_coef_probs[i][j][k][l][t];
                 vpx_prob *oldp = old_coef_probs[i][j][k][l] + t;
-                int s;
+                int64_t s;
                 int u = 0;
 
                 if (t == PIVOT_NODE) {
@@ -963,21 +963,20 @@
   }
 }
 
-static int encode_tiles_buffer_alloc(VP9_COMP *const cpi) {
+static void encode_tiles_buffer_alloc(VP9_COMP *const cpi) {
+  VP9_COMMON *const cm = &cpi->common;
   int i;
   const size_t worker_data_size =
       cpi->num_workers * sizeof(*cpi->vp9_bitstream_worker_data);
-  cpi->vp9_bitstream_worker_data = vpx_memalign(16, worker_data_size);
+  CHECK_MEM_ERROR(cm, cpi->vp9_bitstream_worker_data,
+                  vpx_memalign(16, worker_data_size));
   memset(cpi->vp9_bitstream_worker_data, 0, worker_data_size);
-  if (!cpi->vp9_bitstream_worker_data) return 1;
   for (i = 1; i < cpi->num_workers; ++i) {
     cpi->vp9_bitstream_worker_data[i].dest_size =
         cpi->oxcf.width * cpi->oxcf.height;
-    cpi->vp9_bitstream_worker_data[i].dest =
-        vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size);
-    if (!cpi->vp9_bitstream_worker_data[i].dest) return 1;
+    CHECK_MEM_ERROR(cm, cpi->vp9_bitstream_worker_data[i].dest,
+                    vpx_malloc(cpi->vp9_bitstream_worker_data[i].dest_size));
   }
-  return 0;
 }
 
 static size_t encode_tiles_mt(VP9_COMP *cpi, uint8_t *data_ptr) {
@@ -992,7 +991,7 @@
       cpi->vp9_bitstream_worker_data[1].dest_size >
           (cpi->oxcf.width * cpi->oxcf.height)) {
     vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
-    if (encode_tiles_buffer_alloc(cpi)) return 0;
+    encode_tiles_buffer_alloc(cpi);
   }
 
   while (tile_col < tile_cols) {
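
Note: three independent fixes land in vp9_bitstream.c. The
interp_filter_selected parameter becomes an unsized array of [SWITCHABLE]
rows; coefficient-update savings accumulate in int64_t to avoid 32-bit
overflow (cost_branch256 now returns uint64_t, see the vp9_cost.h hunk
below); and encode_tiles_buffer_alloc reports allocation failure through
CHECK_MEM_ERROR instead of a return code. That macro roughly expands as
follows (paraphrased from vp9/common/vp9_common.h; the exact message text
may differ):

    #define CHECK_MEM_ERROR(cm, lval, expr)                      \
      do {                                                       \
        (lval) = (expr);                                         \
        if (!(lval))                                             \
          vpx_internal_error(&(cm)->error, VPX_CODEC_MEM_ERROR,  \
                             "Failed to allocate " #lval);       \
      } while (0)

On failure this longjmps to the encoder's error handler, so the caller in
encode_tiles_mt no longer needs to check a return value.
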
diff --git a/libvpx/vp9/encoder/vp9_cost.h b/libvpx/vp9/encoder/vp9_cost.h
index 638d72a..ee0033f 100644
--- a/libvpx/vp9/encoder/vp9_cost.h
+++ b/libvpx/vp9/encoder/vp9_cost.h
@@ -29,9 +29,8 @@
 
 #define vp9_cost_bit(prob, bit) vp9_cost_zero((bit) ? 256 - (prob) : (prob))
 
-static INLINE unsigned int cost_branch256(const unsigned int ct[2],
-                                          vpx_prob p) {
-  return ct[0] * vp9_cost_zero(p) + ct[1] * vp9_cost_one(p);
+static INLINE uint64_t cost_branch256(const unsigned int ct[2], vpx_prob p) {
+  return (uint64_t)ct[0] * vp9_cost_zero(p) + (uint64_t)ct[1] * vp9_cost_one(p);
 }
 
 static INLINE int treed_cost(vpx_tree tree, const vpx_prob *probs, int bits,
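
Note: cost_branch256 multiplies a branch count by a per-symbol cost, and in
32-bit arithmetic that product can wrap for large frames or long-running
counts. A sketch with illustrative magnitudes only (the cost-table maximum
is on the order of 2^11):

    #include <stdint.h>
    const uint32_t count = 70000000u;        // large branch count
    const uint32_t cost = 2048u;             // near the per-symbol maximum
    uint32_t wrapped = count * cost;         // wraps modulo 2^32
    uint64_t exact = (uint64_t)count * cost; // 143360000000, kept in full

Widening both operands before the multiply, as the new code does, keeps the
full product.
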
diff --git a/libvpx/vp9/encoder/vp9_encodeframe.c b/libvpx/vp9/encoder/vp9_encodeframe.c
index 131c488..a9f392b 100644
--- a/libvpx/vp9/encoder/vp9_encodeframe.c
+++ b/libvpx/vp9/encoder/vp9_encodeframe.c
@@ -217,8 +217,8 @@
       break;
   }
 
-  // Set segment index from ROI map if it's enabled.
-  if (cpi->roi.enabled)
+  // Set segment index if ROI map or active_map is enabled.
+  if (cpi->roi.enabled || cpi->active_map.enabled)
     mi->segment_id = get_segment_id(cm, map, bsize, mi_row, mi_col);
 
   vp9_init_plane_quantizers(cpi, x);
@@ -1905,13 +1905,17 @@
 }
 
 static void set_mode_info_seg_skip(MACROBLOCK *x, TX_MODE tx_mode,
+                                   INTERP_FILTER interp_filter,
                                    RD_COST *rd_cost, BLOCK_SIZE bsize) {
   MACROBLOCKD *const xd = &x->e_mbd;
   MODE_INFO *const mi = xd->mi[0];
   INTERP_FILTER filter_ref;
 
   filter_ref = get_pred_context_switchable_interp(xd);
-  if (filter_ref == SWITCHABLE_FILTERS) filter_ref = EIGHTTAP;
+  if (interp_filter == BILINEAR)
+    filter_ref = BILINEAR;
+  else if (filter_ref == SWITCHABLE_FILTERS)
+    filter_ref = EIGHTTAP;
 
   mi->sb_type = bsize;
   mi->mode = ZEROMV;
@@ -2495,7 +2499,8 @@
   *(xd->mi[0]) = ctx->mic;
   *(x->mbmi_ext) = ctx->mbmi_ext;
 
-  if (seg->enabled && (cpi->oxcf.aq_mode != NO_AQ || cpi->roi.enabled)) {
+  if (seg->enabled && (cpi->oxcf.aq_mode != NO_AQ || cpi->roi.enabled ||
+                       cpi->active_map.enabled)) {
     // Setting segmentation map for cyclic_refresh.
     if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ &&
         cpi->cyclic_refresh->content_mode) {
@@ -4682,7 +4687,7 @@
     hybrid_search_svc_baseiskey(cpi, x, rd_cost, bsize, ctx, tile_data, mi_row,
                                 mi_col);
   else if (segfeature_active(&cm->seg, mi->segment_id, SEG_LVL_SKIP))
-    set_mode_info_seg_skip(x, cm->tx_mode, rd_cost, bsize);
+    set_mode_info_seg_skip(x, cm->tx_mode, cm->interp_filter, rd_cost, bsize);
   else if (bsize >= BLOCK_8X8) {
     if (cpi->rc.hybrid_intra_scene_change)
       hybrid_search_scene_change(cpi, x, rd_cost, bsize, ctx, tile_data, mi_row,
@@ -5513,16 +5518,6 @@
     x->arf_frame_usage = 0;
     x->lastgolden_frame_usage = 0;
 
-    if (seg->enabled) {
-      const uint8_t *const map =
-          seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
-      int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
-      seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
-      if (seg_skip) {
-        partition_search_type = FIXED_PARTITION;
-      }
-    }
-
     if (cpi->compute_source_sad_onepass && cpi->sf.use_source_sad) {
       int shift = cpi->Source->y_stride * (mi_row << 3) + (mi_col << 3);
       int sb_offset2 = ((cm->mi_cols + 7) >> 3) * (mi_row >> 3) + (mi_col >> 3);
@@ -5534,6 +5529,38 @@
         partition_search_type = REFERENCE_PARTITION;
     }
 
+    if (seg->enabled) {
+      const uint8_t *const map =
+          seg->update_map ? cpi->segmentation_map : cm->last_frame_seg_map;
+      int segment_id = get_segment_id(cm, map, BLOCK_64X64, mi_row, mi_col);
+      seg_skip = segfeature_active(seg, segment_id, SEG_LVL_SKIP);
+
+      if (cpi->roi.enabled && cpi->roi.skip[BACKGROUND_SEG_SKIP_ID] &&
+          cpi->rc.frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY &&
+          x->content_state_sb > kLowSadLowSumdiff) {
+        // For ROI with skip, force segment = 0 (no skip) over the whole
+        // superblock to avoid artifacts when the temporal change in
+        // source_sad is nonzero.
+        int xi, yi;
+        const int bw = num_8x8_blocks_wide_lookup[BLOCK_64X64];
+        const int bh = num_8x8_blocks_high_lookup[BLOCK_64X64];
+        const int xmis = VPXMIN(cm->mi_cols - mi_col, bw);
+        const int ymis = VPXMIN(cm->mi_rows - mi_row, bh);
+        const int block_index = mi_row * cm->mi_cols + mi_col;
+        set_mode_info_offsets(cm, x, xd, mi_row, mi_col);
+        for (yi = 0; yi < ymis; yi++)
+          for (xi = 0; xi < xmis; xi++) {
+            int map_offset = block_index + yi * cm->mi_cols + xi;
+            cpi->segmentation_map[map_offset] = 0;
+          }
+        set_segment_index(cpi, x, mi_row, mi_col, BLOCK_64X64, 0);
+        seg_skip = 0;
+      }
+      if (seg_skip) {
+        partition_search_type = FIXED_PARTITION;
+      }
+    }
+
     // Set the partition type of the 64X64 block
     switch (partition_search_type) {
       case VAR_BASED_PARTITION:
@@ -5829,9 +5856,12 @@
   get_start_tok(cpi, tile_row, tile_col, mi_row, &tok);
   cpi->tplist[tile_row][tile_col][tile_sb_row].start = tok;
 
+#if CONFIG_REALTIME_ONLY
+  assert(cpi->sf.use_nonrd_pick_mode);
+  encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok);
+#else
   if (cpi->sf.use_nonrd_pick_mode)
     encode_nonrd_sb_row(cpi, td, this_tile, mi_row, &tok);
-#if !CONFIG_REALTIME_ONLY
   else
     encode_rd_sb_row(cpi, td, this_tile, mi_row, &tok);
 #endif
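
Note: in vp9_encodeframe.c the SEG_LVL_SKIP check now runs after the
source-SAD pass so that a superblock with real motion can override the ROI
skip segment. The reset loop walks the superblock's footprint in 8x8
mode-info units, clamped at the frame edge; with illustrative numbers
(assuming a 1920-wide frame, so cm->mi_cols == 240):

    // Superblock at mi_row = 8, mi_col = 16:
    //   block_index = 8 * 240 + 16 = 1936
    // Offsets 1936 + yi * 240 + xi cover the 8x8 MI square of the 64x64
    // superblock, unless xmis/ymis clamp it at the right/bottom edge.

The final hunk also makes CONFIG_REALTIME_ONLY builds call
encode_nonrd_sb_row unconditionally, since the RD path is compiled out; the
assert documents that such builds must use the non-RD picker.
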
diff --git a/libvpx/vp9/encoder/vp9_encodemb.c b/libvpx/vp9/encoder/vp9_encodemb.c
index 7630a81..fa222f9 100644
--- a/libvpx/vp9/encoder/vp9_encodemb.c
+++ b/libvpx/vp9/encoder/vp9_encodemb.c
@@ -366,28 +366,28 @@
     switch (tx_size) {
       case TX_32X32:
         highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-        vp9_highbd_quantize_fp_32x32(coeff, 1024, x->skip_block, p->round_fp,
-                                     p->quant_fp, qcoeff, dqcoeff, pd->dequant,
-                                     eob, scan_order->scan, scan_order->iscan);
+        vp9_highbd_quantize_fp_32x32(coeff, 1024, p->round_fp, p->quant_fp,
+                                     qcoeff, dqcoeff, pd->dequant, eob,
+                                     scan_order->scan, scan_order->iscan);
         break;
       case TX_16X16:
         vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
-        vp9_highbd_quantize_fp(coeff, 256, x->skip_block, p->round_fp,
-                               p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob,
-                               scan_order->scan, scan_order->iscan);
+        vp9_highbd_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff,
+                               dqcoeff, pd->dequant, eob, scan_order->scan,
+                               scan_order->iscan);
         break;
       case TX_8X8:
         vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
-        vp9_highbd_quantize_fp(coeff, 64, x->skip_block, p->round_fp,
-                               p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob,
-                               scan_order->scan, scan_order->iscan);
+        vp9_highbd_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff,
+                               dqcoeff, pd->dequant, eob, scan_order->scan,
+                               scan_order->iscan);
         break;
       default:
         assert(tx_size == TX_4X4);
         x->fwd_txfm4x4(src_diff, coeff, diff_stride);
-        vp9_highbd_quantize_fp(coeff, 16, x->skip_block, p->round_fp,
-                               p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob,
-                               scan_order->scan, scan_order->iscan);
+        vp9_highbd_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff,
+                               dqcoeff, pd->dequant, eob, scan_order->scan,
+                               scan_order->iscan);
         break;
     }
     return;
@@ -397,29 +397,26 @@
   switch (tx_size) {
     case TX_32X32:
       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-      vp9_quantize_fp_32x32(coeff, 1024, x->skip_block, p->round_fp,
-                            p->quant_fp, qcoeff, dqcoeff, pd->dequant, eob,
-                            scan_order->scan, scan_order->iscan);
+      vp9_quantize_fp_32x32(coeff, 1024, p->round_fp, p->quant_fp, qcoeff,
+                            dqcoeff, pd->dequant, eob, scan_order->scan,
+                            scan_order->iscan);
       break;
     case TX_16X16:
       vpx_fdct16x16(src_diff, coeff, diff_stride);
-      vp9_quantize_fp(coeff, 256, x->skip_block, p->round_fp, p->quant_fp,
-                      qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
-                      scan_order->iscan);
+      vp9_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff, dqcoeff,
+                      pd->dequant, eob, scan_order->scan, scan_order->iscan);
       break;
     case TX_8X8:
       vpx_fdct8x8(src_diff, coeff, diff_stride);
-      vp9_quantize_fp(coeff, 64, x->skip_block, p->round_fp, p->quant_fp,
-                      qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
-                      scan_order->iscan);
+      vp9_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff, dqcoeff,
+                      pd->dequant, eob, scan_order->scan, scan_order->iscan);
 
       break;
     default:
       assert(tx_size == TX_4X4);
       x->fwd_txfm4x4(src_diff, coeff, diff_stride);
-      vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp,
-                      qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
-                      scan_order->iscan);
+      vp9_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff, dqcoeff,
+                      pd->dequant, eob, scan_order->scan, scan_order->iscan);
       break;
   }
 }
@@ -444,28 +441,24 @@
     switch (tx_size) {
       case TX_32X32:
         vpx_highbd_fdct32x32_1(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_dc_32x32(coeff, x->skip_block, p->round,
-                                     p->quant_fp[0], qcoeff, dqcoeff,
-                                     pd->dequant[0], eob);
+        vpx_highbd_quantize_dc_32x32(coeff, p->round, p->quant_fp[0], qcoeff,
+                                     dqcoeff, pd->dequant[0], eob);
         break;
       case TX_16X16:
         vpx_highbd_fdct16x16_1(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_dc(coeff, 256, x->skip_block, p->round,
-                               p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0],
-                               eob);
+        vpx_highbd_quantize_dc(coeff, 256, p->round, p->quant_fp[0], qcoeff,
+                               dqcoeff, pd->dequant[0], eob);
         break;
       case TX_8X8:
         vpx_highbd_fdct8x8_1(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_dc(coeff, 64, x->skip_block, p->round,
-                               p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0],
-                               eob);
+        vpx_highbd_quantize_dc(coeff, 64, p->round, p->quant_fp[0], qcoeff,
+                               dqcoeff, pd->dequant[0], eob);
         break;
       default:
         assert(tx_size == TX_4X4);
         x->fwd_txfm4x4(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_dc(coeff, 16, x->skip_block, p->round,
-                               p->quant_fp[0], qcoeff, dqcoeff, pd->dequant[0],
-                               eob);
+        vpx_highbd_quantize_dc(coeff, 16, p->round, p->quant_fp[0], qcoeff,
+                               dqcoeff, pd->dequant[0], eob);
         break;
     }
     return;
@@ -475,24 +468,24 @@
   switch (tx_size) {
     case TX_32X32:
       vpx_fdct32x32_1(src_diff, coeff, diff_stride);
-      vpx_quantize_dc_32x32(coeff, x->skip_block, p->round, p->quant_fp[0],
-                            qcoeff, dqcoeff, pd->dequant[0], eob);
+      vpx_quantize_dc_32x32(coeff, p->round, p->quant_fp[0], qcoeff, dqcoeff,
+                            pd->dequant[0], eob);
       break;
     case TX_16X16:
       vpx_fdct16x16_1(src_diff, coeff, diff_stride);
-      vpx_quantize_dc(coeff, 256, x->skip_block, p->round, p->quant_fp[0],
-                      qcoeff, dqcoeff, pd->dequant[0], eob);
+      vpx_quantize_dc(coeff, 256, p->round, p->quant_fp[0], qcoeff, dqcoeff,
+                      pd->dequant[0], eob);
       break;
     case TX_8X8:
       vpx_fdct8x8_1(src_diff, coeff, diff_stride);
-      vpx_quantize_dc(coeff, 64, x->skip_block, p->round, p->quant_fp[0],
-                      qcoeff, dqcoeff, pd->dequant[0], eob);
+      vpx_quantize_dc(coeff, 64, p->round, p->quant_fp[0], qcoeff, dqcoeff,
+                      pd->dequant[0], eob);
       break;
     default:
       assert(tx_size == TX_4X4);
       x->fwd_txfm4x4(src_diff, coeff, diff_stride);
-      vpx_quantize_dc(coeff, 16, x->skip_block, p->round, p->quant_fp[0],
-                      qcoeff, dqcoeff, pd->dequant[0], eob);
+      vpx_quantize_dc(coeff, 16, p->round, p->quant_fp[0], qcoeff, dqcoeff,
+                      pd->dequant[0], eob);
       break;
   }
 }
@@ -518,32 +511,28 @@
     switch (tx_size) {
       case TX_32X32:
         highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
-                                    p->round, p->quant, p->quant_shift, qcoeff,
-                                    dqcoeff, pd->dequant, eob, scan_order->scan,
-                                    scan_order->iscan);
+        vpx_highbd_quantize_b_32x32(
+            coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff,
+            dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan);
         break;
       case TX_16X16:
         vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
-                              p->quant, p->quant_shift, qcoeff, dqcoeff,
-                              pd->dequant, eob, scan_order->scan,
-                              scan_order->iscan);
+        vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant,
+                              p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+                              scan_order->scan, scan_order->iscan);
         break;
       case TX_8X8:
         vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
-                              p->quant, p->quant_shift, qcoeff, dqcoeff,
-                              pd->dequant, eob, scan_order->scan,
-                              scan_order->iscan);
+        vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant,
+                              p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+                              scan_order->scan, scan_order->iscan);
         break;
       default:
         assert(tx_size == TX_4X4);
         x->fwd_txfm4x4(src_diff, coeff, diff_stride);
-        vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
-                              p->quant, p->quant_shift, qcoeff, dqcoeff,
-                              pd->dequant, eob, scan_order->scan,
-                              scan_order->iscan);
+        vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant,
+                              p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+                              scan_order->scan, scan_order->iscan);
         break;
     }
     return;
@@ -553,29 +542,28 @@
   switch (tx_size) {
     case TX_32X32:
       fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-      vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
-                           p->quant, p->quant_shift, qcoeff, dqcoeff,
-                           pd->dequant, eob, scan_order->scan,
-                           scan_order->iscan);
+      vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant,
+                           p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+                           scan_order->scan, scan_order->iscan);
       break;
     case TX_16X16:
       vpx_fdct16x16(src_diff, coeff, diff_stride);
-      vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant,
-                     p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                     scan_order->scan, scan_order->iscan);
+      vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift,
+                     qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
+                     scan_order->iscan);
       break;
     case TX_8X8:
       vpx_fdct8x8(src_diff, coeff, diff_stride);
-      vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
-                     p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                     scan_order->scan, scan_order->iscan);
+      vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift,
+                     qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
+                     scan_order->iscan);
       break;
     default:
       assert(tx_size == TX_4X4);
       x->fwd_txfm4x4(src_diff, coeff, diff_stride);
-      vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
-                     p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                     scan_order->scan, scan_order->iscan);
+      vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift,
+                     qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
+                     scan_order->iscan);
       break;
   }
 }
@@ -869,10 +857,9 @@
           vpx_highbd_subtract_block(32, 32, src_diff, diff_stride, src,
                                     src_stride, dst, dst_stride, xd->bd);
           highbd_fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-          vpx_highbd_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin,
-                                      p->round, p->quant, p->quant_shift,
-                                      qcoeff, dqcoeff, pd->dequant, eob,
-                                      scan_order->scan, scan_order->iscan);
+          vpx_highbd_quantize_b_32x32(
+              coeff, 1024, p->zbin, p->round, p->quant, p->quant_shift, qcoeff,
+              dqcoeff, pd->dequant, eob, scan_order->scan, scan_order->iscan);
         }
         if (args->enable_coeff_opt && !x->skip_recode) {
           *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
@@ -889,10 +876,9 @@
             vpx_highbd_fdct16x16(src_diff, coeff, diff_stride);
           else
             vp9_highbd_fht16x16(src_diff, coeff, diff_stride, tx_type);
-          vpx_highbd_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round,
-                                p->quant, p->quant_shift, qcoeff, dqcoeff,
-                                pd->dequant, eob, scan_order->scan,
-                                scan_order->iscan);
+          vpx_highbd_quantize_b(coeff, 256, p->zbin, p->round, p->quant,
+                                p->quant_shift, qcoeff, dqcoeff, pd->dequant,
+                                eob, scan_order->scan, scan_order->iscan);
         }
         if (args->enable_coeff_opt && !x->skip_recode) {
           *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
@@ -910,10 +896,9 @@
             vpx_highbd_fdct8x8(src_diff, coeff, diff_stride);
           else
             vp9_highbd_fht8x8(src_diff, coeff, diff_stride, tx_type);
-          vpx_highbd_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round,
-                                p->quant, p->quant_shift, qcoeff, dqcoeff,
-                                pd->dequant, eob, scan_order->scan,
-                                scan_order->iscan);
+          vpx_highbd_quantize_b(coeff, 64, p->zbin, p->round, p->quant,
+                                p->quant_shift, qcoeff, dqcoeff, pd->dequant,
+                                eob, scan_order->scan, scan_order->iscan);
         }
         if (args->enable_coeff_opt && !x->skip_recode) {
           *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
@@ -932,10 +917,9 @@
             vp9_highbd_fht4x4(src_diff, coeff, diff_stride, tx_type);
           else
             x->fwd_txfm4x4(src_diff, coeff, diff_stride);
-          vpx_highbd_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round,
-                                p->quant, p->quant_shift, qcoeff, dqcoeff,
-                                pd->dequant, eob, scan_order->scan,
-                                scan_order->iscan);
+          vpx_highbd_quantize_b(coeff, 16, p->zbin, p->round, p->quant,
+                                p->quant_shift, qcoeff, dqcoeff, pd->dequant,
+                                eob, scan_order->scan, scan_order->iscan);
         }
         if (args->enable_coeff_opt && !x->skip_recode) {
           *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
@@ -964,10 +948,9 @@
         vpx_subtract_block(32, 32, src_diff, diff_stride, src, src_stride, dst,
                            dst_stride);
         fdct32x32(x->use_lp32x32fdct, src_diff, coeff, diff_stride);
-        vpx_quantize_b_32x32(coeff, 1024, x->skip_block, p->zbin, p->round,
-                             p->quant, p->quant_shift, qcoeff, dqcoeff,
-                             pd->dequant, eob, scan_order->scan,
-                             scan_order->iscan);
+        vpx_quantize_b_32x32(coeff, 1024, p->zbin, p->round, p->quant,
+                             p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
+                             scan_order->scan, scan_order->iscan);
       }
       if (args->enable_coeff_opt && !x->skip_recode) {
         *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
@@ -980,9 +963,9 @@
         vpx_subtract_block(16, 16, src_diff, diff_stride, src, src_stride, dst,
                            dst_stride);
         vp9_fht16x16(src_diff, coeff, diff_stride, tx_type);
-        vpx_quantize_b(coeff, 256, x->skip_block, p->zbin, p->round, p->quant,
-                       p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                       scan_order->scan, scan_order->iscan);
+        vpx_quantize_b(coeff, 256, p->zbin, p->round, p->quant, p->quant_shift,
+                       qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
+                       scan_order->iscan);
       }
       if (args->enable_coeff_opt && !x->skip_recode) {
         *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
@@ -995,9 +978,9 @@
         vpx_subtract_block(8, 8, src_diff, diff_stride, src, src_stride, dst,
                            dst_stride);
         vp9_fht8x8(src_diff, coeff, diff_stride, tx_type);
-        vpx_quantize_b(coeff, 64, x->skip_block, p->zbin, p->round, p->quant,
-                       p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                       scan_order->scan, scan_order->iscan);
+        vpx_quantize_b(coeff, 64, p->zbin, p->round, p->quant, p->quant_shift,
+                       qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
+                       scan_order->iscan);
       }
       if (args->enable_coeff_opt && !x->skip_recode) {
         *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
@@ -1014,9 +997,9 @@
           vp9_fht4x4(src_diff, coeff, diff_stride, tx_type);
         else
           x->fwd_txfm4x4(src_diff, coeff, diff_stride);
-        vpx_quantize_b(coeff, 16, x->skip_block, p->zbin, p->round, p->quant,
-                       p->quant_shift, qcoeff, dqcoeff, pd->dequant, eob,
-                       scan_order->scan, scan_order->iscan);
+        vpx_quantize_b(coeff, 16, p->zbin, p->round, p->quant, p->quant_shift,
+                       qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
+                       scan_order->iscan);
       }
       if (args->enable_coeff_opt && !x->skip_recode) {
         *a = *l = vp9_optimize_b(x, plane, block, tx_size, entropy_ctx) > 0;
diff --git a/libvpx/vp9/encoder/vp9_encoder.c b/libvpx/vp9/encoder/vp9_encoder.c
index 7e80835..d3f4d1e 100644
--- a/libvpx/vp9/encoder/vp9_encoder.c
+++ b/libvpx/vp9/encoder/vp9_encoder.c
@@ -25,6 +25,7 @@
 #endif
 #include "vpx_ports/mem.h"
 #include "vpx_ports/system_state.h"
+#include "vpx_ports/vpx_once.h"
 #include "vpx_ports/vpx_timer.h"
 #if CONFIG_BITSTREAM_DEBUG || CONFIG_MISMATCH_DEBUG
 #include "vpx_util/vpx_debug_util.h"
@@ -585,8 +586,6 @@
   int ref_frame[8];
   int internal_delta_q[MAX_SEGMENTS];
   int i;
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
 
  // TODO(jianj): Investigate why ROI is not working in speed < 5 or in
  // non-realtime mode.
@@ -618,7 +617,7 @@
     }
     if (skip[i] != 0) {
       vp9_enable_segfeature(seg, i, SEG_LVL_SKIP);
-      vp9_set_segdata(seg, i, SEG_LVL_SKIP, skip[i]);
+      vp9_set_segdata(seg, i, SEG_LVL_SKIP, 0);
     }
     if (ref_frame[i] >= 0) {
       int valid_ref = 1;
@@ -627,7 +626,7 @@
         valid_ref = 0;
       // If GOLDEN is selected, make sure it's set as reference.
       if (ref_frame[i] == GOLDEN_FRAME &&
-          !(cpi->ref_frame_flags & flag_list[ref_frame[i]])) {
+          !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame[i]))) {
         valid_ref = 0;
       }
       // GOLDEN was updated in previous encoded frame, so GOLDEN and LAST are
@@ -929,24 +928,21 @@
   cm->prev_mi_grid_visible = cm->prev_mi_grid_base + cm->mi_stride + 1;
 }
 
-void vp9_initialize_enc(void) {
-  static volatile int init_done = 0;
-
-  if (!init_done) {
-    vp9_rtcd();
-    vpx_dsp_rtcd();
-    vpx_scale_rtcd();
-    vp9_init_intra_predictors();
-    vp9_init_me_luts();
-    vp9_rc_init_minq_luts();
-    vp9_entropy_mv_init();
+static void initialize_enc(void) {
+  vp9_rtcd();
+  vpx_dsp_rtcd();
+  vpx_scale_rtcd();
+  vp9_init_intra_predictors();
+  vp9_init_me_luts();
+  vp9_rc_init_minq_luts();
+  vp9_entropy_mv_init();
 #if !CONFIG_REALTIME_ONLY
-    vp9_temporal_filter_init();
+  vp9_temporal_filter_init();
 #endif
-    init_done = 1;
-  }
 }
 
+void vp9_initialize_enc(void) { once(initialize_enc); }
+
 static void dealloc_compressor_data(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   int i;
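
Note: vp9_initialize_enc previously guarded its one-time setup with a
"static volatile int" flag, which is racy when two threads create encoders
concurrently (both can observe init_done == 0 and run the setup twice). It
now delegates to once() from the newly included vpx_ports/vpx_once.h, which
wraps the platform's one-shot primitive. The equivalent POSIX pattern (a
sketch of the idiom, not the vpx_once implementation itself):

    #include <pthread.h>

    static pthread_once_t enc_once = PTHREAD_ONCE_INIT;

    static void initialize_enc(void) { /* one-time table setup */ }

    void vp9_initialize_enc(void) { pthread_once(&enc_once, initialize_enc); }
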
@@ -1383,21 +1379,22 @@
 #endif
 }
 
-static int alloc_context_buffers_ext(VP9_COMP *cpi) {
+static void alloc_context_buffers_ext(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   int mi_size = cm->mi_cols * cm->mi_rows;
 
-  cpi->mbmi_ext_base = vpx_calloc(mi_size, sizeof(*cpi->mbmi_ext_base));
-  if (!cpi->mbmi_ext_base) return 1;
-
-  return 0;
+  CHECK_MEM_ERROR(cm, cpi->mbmi_ext_base,
+                  vpx_calloc(mi_size, sizeof(*cpi->mbmi_ext_base)));
 }
 
 static void alloc_compressor_data(VP9_COMP *cpi) {
   VP9_COMMON *cm = &cpi->common;
   int sb_rows;
 
-  vp9_alloc_context_buffers(cm, cm->width, cm->height);
+  if (vp9_alloc_context_buffers(cm, cm->width, cm->height)) {
+    vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                       "Failed to allocate context buffers");
+  }
 
   alloc_context_buffers_ext(cpi);
 
@@ -1573,15 +1570,13 @@
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-// TODO(angiebird): make sdx8f available for highbitdepth if needed
 #define HIGHBD_BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \
   cpi->fn_ptr[BT].sdf = SDF;                             \
   cpi->fn_ptr[BT].sdaf = SDAF;                           \
   cpi->fn_ptr[BT].vf = VF;                               \
   cpi->fn_ptr[BT].svf = SVF;                             \
   cpi->fn_ptr[BT].svaf = SVAF;                           \
-  cpi->fn_ptr[BT].sdx4df = SDX4DF;                       \
-  cpi->fn_ptr[BT].sdx8f = NULL;
+  cpi->fn_ptr[BT].sdx4df = SDX4DF;
 
 #define MAKE_BFP_SAD_WRAPPER(fnname)                                           \
   static unsigned int fnname##_bits8(const uint8_t *src_ptr,                   \
@@ -2062,7 +2057,10 @@
       cpi->external_resize = 0;
     } else if (cm->mi_alloc_size == new_mi_size &&
                (cpi->oxcf.width > last_w || cpi->oxcf.height > last_h)) {
-      vp9_alloc_loop_filter(cm);
+      if (vp9_alloc_loop_filter(cm)) {
+        vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
+                           "Failed to allocate loop filter data");
+      }
     }
   }
 
@@ -2223,6 +2221,7 @@
   switch (fmt) {
     case VPX_IMG_FMT_I420:
     case VPX_IMG_FMT_YV12:
+    case VPX_IMG_FMT_NV12:
     case VPX_IMG_FMT_I422:
     case VPX_IMG_FMT_I42016:
     case VPX_IMG_FMT_I42216: *subsampling_x = 1; break;
@@ -2233,6 +2232,7 @@
     case VPX_IMG_FMT_I420:
     case VPX_IMG_FMT_I440:
     case VPX_IMG_FMT_YV12:
+    case VPX_IMG_FMT_NV12:
     case VPX_IMG_FMT_I42016:
     case VPX_IMG_FMT_I44016: *subsampling_y = 1; break;
     default: *subsampling_y = 0; break;
@@ -2563,67 +2563,61 @@
   CHECK_MEM_ERROR(cm, cpi->source_diff_var, vpx_calloc(cm->MBs, sizeof(diff)));
   cpi->source_var_thresh = 0;
   cpi->frames_till_next_var_check = 0;
-#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF, SDX8F) \
-  cpi->fn_ptr[BT].sdf = SDF;                             \
-  cpi->fn_ptr[BT].sdaf = SDAF;                           \
-  cpi->fn_ptr[BT].vf = VF;                               \
-  cpi->fn_ptr[BT].svf = SVF;                             \
-  cpi->fn_ptr[BT].svaf = SVAF;                           \
-  cpi->fn_ptr[BT].sdx4df = SDX4DF;                       \
-  cpi->fn_ptr[BT].sdx8f = SDX8F;
+#define BFP(BT, SDF, SDAF, VF, SVF, SVAF, SDX4DF) \
+  cpi->fn_ptr[BT].sdf = SDF;                      \
+  cpi->fn_ptr[BT].sdaf = SDAF;                    \
+  cpi->fn_ptr[BT].vf = VF;                        \
+  cpi->fn_ptr[BT].svf = SVF;                      \
+  cpi->fn_ptr[BT].svaf = SVAF;                    \
+  cpi->fn_ptr[BT].sdx4df = SDX4DF;
 
-  // TODO(angiebird): make sdx8f available for every block size
   BFP(BLOCK_32X16, vpx_sad32x16, vpx_sad32x16_avg, vpx_variance32x16,
       vpx_sub_pixel_variance32x16, vpx_sub_pixel_avg_variance32x16,
-      vpx_sad32x16x4d, NULL)
+      vpx_sad32x16x4d)
 
   BFP(BLOCK_16X32, vpx_sad16x32, vpx_sad16x32_avg, vpx_variance16x32,
       vpx_sub_pixel_variance16x32, vpx_sub_pixel_avg_variance16x32,
-      vpx_sad16x32x4d, NULL)
+      vpx_sad16x32x4d)
 
   BFP(BLOCK_64X32, vpx_sad64x32, vpx_sad64x32_avg, vpx_variance64x32,
       vpx_sub_pixel_variance64x32, vpx_sub_pixel_avg_variance64x32,
-      vpx_sad64x32x4d, NULL)
+      vpx_sad64x32x4d)
 
   BFP(BLOCK_32X64, vpx_sad32x64, vpx_sad32x64_avg, vpx_variance32x64,
       vpx_sub_pixel_variance32x64, vpx_sub_pixel_avg_variance32x64,
-      vpx_sad32x64x4d, NULL)
+      vpx_sad32x64x4d)
 
   BFP(BLOCK_32X32, vpx_sad32x32, vpx_sad32x32_avg, vpx_variance32x32,
       vpx_sub_pixel_variance32x32, vpx_sub_pixel_avg_variance32x32,
-      vpx_sad32x32x4d, vpx_sad32x32x8)
+      vpx_sad32x32x4d)
 
   BFP(BLOCK_64X64, vpx_sad64x64, vpx_sad64x64_avg, vpx_variance64x64,
       vpx_sub_pixel_variance64x64, vpx_sub_pixel_avg_variance64x64,
-      vpx_sad64x64x4d, NULL)
+      vpx_sad64x64x4d)
 
   BFP(BLOCK_16X16, vpx_sad16x16, vpx_sad16x16_avg, vpx_variance16x16,
       vpx_sub_pixel_variance16x16, vpx_sub_pixel_avg_variance16x16,
-      vpx_sad16x16x4d, vpx_sad16x16x8)
+      vpx_sad16x16x4d)
 
   BFP(BLOCK_16X8, vpx_sad16x8, vpx_sad16x8_avg, vpx_variance16x8,
       vpx_sub_pixel_variance16x8, vpx_sub_pixel_avg_variance16x8,
-      vpx_sad16x8x4d, vpx_sad16x8x8)
+      vpx_sad16x8x4d)
 
   BFP(BLOCK_8X16, vpx_sad8x16, vpx_sad8x16_avg, vpx_variance8x16,
       vpx_sub_pixel_variance8x16, vpx_sub_pixel_avg_variance8x16,
-      vpx_sad8x16x4d, vpx_sad8x16x8)
+      vpx_sad8x16x4d)
 
   BFP(BLOCK_8X8, vpx_sad8x8, vpx_sad8x8_avg, vpx_variance8x8,
-      vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d,
-      vpx_sad8x8x8)
+      vpx_sub_pixel_variance8x8, vpx_sub_pixel_avg_variance8x8, vpx_sad8x8x4d)
 
   BFP(BLOCK_8X4, vpx_sad8x4, vpx_sad8x4_avg, vpx_variance8x4,
-      vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d,
-      NULL)
+      vpx_sub_pixel_variance8x4, vpx_sub_pixel_avg_variance8x4, vpx_sad8x4x4d)
 
   BFP(BLOCK_4X8, vpx_sad4x8, vpx_sad4x8_avg, vpx_variance4x8,
-      vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d,
-      NULL)
+      vpx_sub_pixel_variance4x8, vpx_sub_pixel_avg_variance4x8, vpx_sad4x8x4d)
 
   BFP(BLOCK_4X4, vpx_sad4x4, vpx_sad4x4_avg, vpx_variance4x4,
-      vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d,
-      vpx_sad4x4x8)
+      vpx_sub_pixel_variance4x4, vpx_sub_pixel_avg_variance4x4, vpx_sad4x4x4d)
 
 #if CONFIG_VP9_HIGHBITDEPTH
   highbd_set_var_fns(cpi);
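
Note: the sdx8f (8-candidate SAD) table entry is gone along with the
vpx_sad*x8 kernels, so the BFP macro loses its last parameter. For one block
size the invocation expands to plain function-pointer assignments:

    cpi->fn_ptr[BLOCK_16X16].sdf = vpx_sad16x16;
    cpi->fn_ptr[BLOCK_16X16].sdaf = vpx_sad16x16_avg;
    cpi->fn_ptr[BLOCK_16X16].vf = vpx_variance16x16;
    cpi->fn_ptr[BLOCK_16X16].svf = vpx_sub_pixel_variance16x16;
    cpi->fn_ptr[BLOCK_16X16].svaf = vpx_sub_pixel_avg_variance16x16;
    cpi->fn_ptr[BLOCK_16X16].sdx4df = vpx_sad16x16x4d;
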
@@ -2676,7 +2670,6 @@
 void vp9_remove_compressor(VP9_COMP *cpi) {
   VP9_COMMON *cm;
   unsigned int i;
-  int t;
 
   if (!cpi) return;
 
@@ -2789,28 +2782,10 @@
 
   free_tpl_buffer(cpi);
 
-  for (t = 0; t < cpi->num_workers; ++t) {
-    VPxWorker *const worker = &cpi->workers[t];
-    EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
-
-    // Deallocate allocated threads.
-    vpx_get_worker_interface()->end(worker);
-
-    // Deallocate allocated thread data.
-    if (t < cpi->num_workers - 1) {
-      vpx_free(thread_data->td->counts);
-      vp9_free_pc_tree(thread_data->td);
-      vpx_free(thread_data->td);
-    }
-  }
-  vpx_free(cpi->tile_thr_data);
-  vpx_free(cpi->workers);
+  vp9_loop_filter_dealloc(&cpi->lf_row_sync);
+  vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
   vp9_row_mt_mem_dealloc(cpi);
-
-  if (cpi->num_workers > 1) {
-    vp9_loop_filter_dealloc(&cpi->lf_row_sync);
-    vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
-  }
+  vp9_encode_free_mt_data(cpi);
 
 #if !CONFIG_REALTIME_ONLY
   vp9_alt_ref_aq_destroy(cpi->alt_ref_aq);
@@ -3712,9 +3687,9 @@
       case 6: l = 150; break;
     }
     if (!cpi->common.postproc_state.limits) {
-      cpi->common.postproc_state.limits =
-          vpx_calloc(cpi->un_scaled_source->y_width,
-                     sizeof(*cpi->common.postproc_state.limits));
+      CHECK_MEM_ERROR(cm, cpi->common.postproc_state.limits,
+                      vpx_calloc(cpi->un_scaled_source->y_width,
+                                 sizeof(*cpi->common.postproc_state.limits)));
     }
     vp9_denoise(&cpi->common, cpi->Source, cpi->Source, l,
                 cpi->common.postproc_state.limits);
@@ -4137,11 +4112,22 @@
     vp9_alt_ref_aq_setup_map(cpi->alt_ref_aq, cpi);
   } else {
 #endif
-    if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
+    // If ROI is enabled and the skip feature is used for segmentation, apply
+    // cyclic refresh but do not apply the ROI skip for the first 20 frames
+    // (defined by FRAMES_NO_SKIPPING_AFTER_KEY) after a key frame, to
+    // improve quality.
+    if (cpi->roi.enabled && !frame_is_intra_only(cm)) {
+      if (cpi->roi.skip[BACKGROUND_SEG_SKIP_ID]) {
+        if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
+          vp9_cyclic_refresh_setup(cpi);
+        if (cpi->rc.frames_since_key > FRAMES_NO_SKIPPING_AFTER_KEY)
+          apply_roi_map(cpi);
+      } else {
+        apply_roi_map(cpi);
+      }
+    } else if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ) {
       vp9_cyclic_refresh_setup(cpi);
-    } else if (cpi->roi.enabled && !frame_is_intra_only(cm)) {
-      apply_roi_map(cpi);
     }
+
 #if !CONFIG_REALTIME_ONLY
   }
 #endif
@@ -6630,19 +6616,22 @@
   int pix_num = 1 << num_pels_log2_lookup[txsize_to_bsize[tx_size]];
   const int shift = tx_size == TX_32X32 ? 0 : 2;
 
+  // The skip-block condition should be handled before this function is
+  // called.
+  assert(!x->skip_block);
+
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vp9_highbd_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp,
-                                 p->quant_fp, qcoeff, dqcoeff, pd->dequant,
-                                 &eob, scan_order->scan, scan_order->iscan);
+    vp9_highbd_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp,
+                                 qcoeff, dqcoeff, pd->dequant, &eob,
+                                 scan_order->scan, scan_order->iscan);
   } else {
-    vp9_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp,
-                          p->quant_fp, qcoeff, dqcoeff, pd->dequant, &eob,
-                          scan_order->scan, scan_order->iscan);
+    vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff,
+                          dqcoeff, pd->dequant, &eob, scan_order->scan,
+                          scan_order->iscan);
   }
 #else
-  vp9_quantize_fp_32x32(coeff, pix_num, x->skip_block, p->round_fp, p->quant_fp,
-                        qcoeff, dqcoeff, pd->dequant, &eob, scan_order->scan,
+  vp9_quantize_fp_32x32(coeff, pix_num, p->round_fp, p->quant_fp, qcoeff,
+                        dqcoeff, pd->dequant, &eob, scan_order->scan,
                         scan_order->iscan);
 #endif  // CONFIG_VP9_HIGHBITDEPTH
 
diff --git a/libvpx/vp9/encoder/vp9_encoder.h b/libvpx/vp9/encoder/vp9_encoder.h
index 9774a64..1d58945 100644
--- a/libvpx/vp9/encoder/vp9_encoder.h
+++ b/libvpx/vp9/encoder/vp9_encoder.h
@@ -1196,6 +1196,13 @@
          (cpi->refresh_golden_frame && !cpi->rc.is_src_frame_alt_ref);
 }
 
+static INLINE int ref_frame_to_flag(int8_t ref_frame) {
+  static const int kVp9RefFlagList[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
+                                          VP9_ALT_FLAG };
+  assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME);
+  return kVp9RefFlagList[ref_frame];
+}
+
 static INLINE int get_ref_frame_map_idx(const VP9_COMP *cpi,
                                         MV_REFERENCE_FRAME ref_frame) {
   if (ref_frame == LAST_FRAME) {
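
Note: ref_frame_to_flag replaces the function-local flag_list table removed
from vp9_encoder.c, adding a range assert so an out-of-range reference frame
fails loudly in debug builds. Usage as in the ROI validation code earlier in
this patch:

    // Reject GOLDEN as an ROI reference when it is not an active reference.
    if (ref_frame[i] == GOLDEN_FRAME &&
        !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame[i]))) {
      valid_ref = 0;
    }
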
diff --git a/libvpx/vp9/encoder/vp9_ethread.c b/libvpx/vp9/encoder/vp9_ethread.c
index e7f8a53..453fe2e 100644
--- a/libvpx/vp9/encoder/vp9_ethread.c
+++ b/libvpx/vp9/encoder/vp9_ethread.c
@@ -8,6 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
+#include "vp9/common/vp9_thread_common.h"
+#include "vp9/encoder/vp9_bitstream.h"
 #include "vp9/encoder/vp9_encodeframe.h"
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_ethread.h"
@@ -79,60 +81,59 @@
   VP9_COMMON *const cm = &cpi->common;
   const VPxWorkerInterface *const winterface = vpx_get_worker_interface();
   int i;
+  // When using SVC, we need to allocate threads according to the highest
+  // resolution. When row-based multithreading is enabled, it is OK to
+  // allocate more threads than the maximum number of tile columns.
+  if (cpi->use_svc && !cpi->row_mt) {
+    int max_tile_cols = get_max_tile_cols(cpi);
+    num_workers = VPXMIN(cpi->oxcf.max_threads, max_tile_cols);
+  }
+  assert(num_workers > 0);
+  if (num_workers == cpi->num_workers) return;
+  vp9_loop_filter_dealloc(&cpi->lf_row_sync);
+  vp9_bitstream_encode_tiles_buffer_dealloc(cpi);
+  vp9_encode_free_mt_data(cpi);
 
-  // Only run once to create threads and allocate thread data.
-  if (cpi->num_workers == 0) {
-    int allocated_workers = num_workers;
+  CHECK_MEM_ERROR(cm, cpi->workers,
+                  vpx_malloc(num_workers * sizeof(*cpi->workers)));
 
-    // While using SVC, we need to allocate threads according to the highest
-    // resolution. When row based multithreading is enabled, it is OK to
-    // allocate more threads than the number of max tile columns.
-    if (cpi->use_svc && !cpi->row_mt) {
-      int max_tile_cols = get_max_tile_cols(cpi);
-      allocated_workers = VPXMIN(cpi->oxcf.max_threads, max_tile_cols);
+  CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
+                  vpx_calloc(num_workers, sizeof(*cpi->tile_thr_data)));
+
+  for (i = 0; i < num_workers; i++) {
+    VPxWorker *const worker = &cpi->workers[i];
+    EncWorkerData *thread_data = &cpi->tile_thr_data[i];
+
+    ++cpi->num_workers;
+    winterface->init(worker);
+
+    if (i < num_workers - 1) {
+      thread_data->cpi = cpi;
+
+      // Allocate thread data.
+      CHECK_MEM_ERROR(cm, thread_data->td,
+                      vpx_memalign(32, sizeof(*thread_data->td)));
+      vp9_zero(*thread_data->td);
+
+      // Set up pc_tree.
+      thread_data->td->leaf_tree = NULL;
+      thread_data->td->pc_tree = NULL;
+      vp9_setup_pc_tree(cm, thread_data->td);
+
+      // Allocate frame counters in thread data.
+      CHECK_MEM_ERROR(cm, thread_data->td->counts,
+                      vpx_calloc(1, sizeof(*thread_data->td->counts)));
+
+      // Create threads
+      if (!winterface->reset(worker))
+        vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
+                           "Tile encoder thread creation failed");
+    } else {
+      // Main thread acts as a worker and uses the thread data in cpi.
+      thread_data->cpi = cpi;
+      thread_data->td = &cpi->td;
     }
-
-    CHECK_MEM_ERROR(cm, cpi->workers,
-                    vpx_malloc(allocated_workers * sizeof(*cpi->workers)));
-
-    CHECK_MEM_ERROR(cm, cpi->tile_thr_data,
-                    vpx_calloc(allocated_workers, sizeof(*cpi->tile_thr_data)));
-
-    for (i = 0; i < allocated_workers; i++) {
-      VPxWorker *const worker = &cpi->workers[i];
-      EncWorkerData *thread_data = &cpi->tile_thr_data[i];
-
-      ++cpi->num_workers;
-      winterface->init(worker);
-
-      if (i < allocated_workers - 1) {
-        thread_data->cpi = cpi;
-
-        // Allocate thread data.
-        CHECK_MEM_ERROR(cm, thread_data->td,
-                        vpx_memalign(32, sizeof(*thread_data->td)));
-        vp9_zero(*thread_data->td);
-
-        // Set up pc_tree.
-        thread_data->td->leaf_tree = NULL;
-        thread_data->td->pc_tree = NULL;
-        vp9_setup_pc_tree(cm, thread_data->td);
-
-        // Allocate frame counters in thread data.
-        CHECK_MEM_ERROR(cm, thread_data->td->counts,
-                        vpx_calloc(1, sizeof(*thread_data->td->counts)));
-
-        // Create threads
-        if (!winterface->reset(worker))
-          vpx_internal_error(&cm->error, VPX_CODEC_ERROR,
-                             "Tile encoder thread creation failed");
-      } else {
-        // Main thread acts as a worker and uses the thread data in cpi.
-        thread_data->cpi = cpi;
-        thread_data->td = &cpi->td;
-      }
-      winterface->sync(worker);
-    }
+    winterface->sync(worker);
   }
 }
 
@@ -169,6 +170,27 @@
   }
 }
 
+void vp9_encode_free_mt_data(struct VP9_COMP *cpi) {
+  int t;
+  for (t = 0; t < cpi->num_workers; ++t) {
+    VPxWorker *const worker = &cpi->workers[t];
+    EncWorkerData *const thread_data = &cpi->tile_thr_data[t];
+
+    // Deallocate allocated threads.
+    vpx_get_worker_interface()->end(worker);
+
+    // Deallocate allocated thread data.
+    if (t < cpi->num_workers - 1) {
+      vpx_free(thread_data->td->counts);
+      vp9_free_pc_tree(thread_data->td);
+      vpx_free(thread_data->td);
+    }
+  }
+  vpx_free(cpi->tile_thr_data);
+  vpx_free(cpi->workers);
+  cpi->num_workers = 0;
+}
+
 void vp9_encode_tiles_mt(VP9_COMP *cpi) {
   VP9_COMMON *const cm = &cpi->common;
   const int tile_cols = 1 << cm->log2_tile_cols;
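
vp9_create_encoding_workers() now tears down and reallocates the worker pool whenever the requested worker count changes, where the old code allocated only once (when cpi->num_workers was 0); the new vp9_encode_free_mt_data() centralizes the teardown. A simplified sketch of the resize-on-change pattern, with hypothetical names and an int array standing in for the per-worker state:

#include <stdlib.h>

typedef struct {
  int num_workers;
  int *workers; /* stand-in for the per-worker thread data */
} Pool;

static void pool_free(Pool *p) {
  free(p->workers);
  p->workers = NULL;
  p->num_workers = 0;
}

/* Reallocate only when the requested size actually changes. */
static int pool_resize(Pool *p, int num_workers) {
  if (num_workers == p->num_workers) return 0; /* nothing to do */
  pool_free(p);
  p->workers = (int *)calloc((size_t)num_workers, sizeof(*p->workers));
  if (p->workers == NULL) return -1;
  p->num_workers = num_workers;
  return 0;
}

int main(void) {
  Pool p = { 0, NULL };
  if (pool_resize(&p, 4)) return 1;
  if (pool_resize(&p, 4)) return 1; /* no-op: size unchanged */
  if (pool_resize(&p, 2)) return 1; /* frees the old pool, makes a new one */
  pool_free(&p);
  return 0;
}

Keeping the pool when the size is unchanged avoids respawning threads on every frame while still adapting when, for example, an SVC layer switch changes the usable worker count.
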
diff --git a/libvpx/vp9/encoder/vp9_ethread.h b/libvpx/vp9/encoder/vp9_ethread.h
index cda0293..4c192da 100644
--- a/libvpx/vp9/encoder/vp9_ethread.h
+++ b/libvpx/vp9/encoder/vp9_ethread.h
@@ -42,6 +42,11 @@
   int rows;
 } VP9RowMTSync;
 
+// Frees EncWorkerData-related allocations made by vp9_encode_*_mt().
+// Row-mt-specific data is freed with vp9_row_mt_mem_dealloc(), which is not
+// called by this function.
+void vp9_encode_free_mt_data(struct VP9_COMP *cpi);
+
 void vp9_encode_tiles_mt(struct VP9_COMP *cpi);
 
 void vp9_encode_tiles_row_mt(struct VP9_COMP *cpi);
diff --git a/libvpx/vp9/encoder/vp9_mbgraph.c b/libvpx/vp9/encoder/vp9_mbgraph.c
index 831c79c..7c2790c 100644
--- a/libvpx/vp9/encoder/vp9_mbgraph.c
+++ b/libvpx/vp9/encoder/vp9_mbgraph.c
@@ -219,7 +219,7 @@
   VP9_COMMON *const cm = &cpi->common;
 
   int mb_col, mb_row, offset = 0;
-  int mb_y_offset = 0, arf_y_offset = 0, gld_y_offset = 0;
+  int mb_y_offset = 0;
   MV gld_top_mv = { 0, 0 };
   MODE_INFO mi_local;
   MODE_INFO mi_above, mi_left;
@@ -243,8 +243,6 @@
   for (mb_row = 0; mb_row < cm->mb_rows; mb_row++) {
     MV gld_left_mv = gld_top_mv;
     int mb_y_in_offset = mb_y_offset;
-    int arf_y_in_offset = arf_y_offset;
-    int gld_y_in_offset = gld_y_offset;
 
     // Set up limit values for motion vectors to prevent them extending outside
     // the UMV borders.
@@ -266,8 +264,6 @@
       xd->left_mi = &mi_left;
 
       mb_y_in_offset += 16;
-      gld_y_in_offset += 16;
-      arf_y_in_offset += 16;
       x->mv_limits.col_min -= 16;
       x->mv_limits.col_max -= 16;
     }
@@ -276,8 +272,6 @@
     xd->above_mi = &mi_above;
 
     mb_y_offset += buf->y_stride * 16;
-    gld_y_offset += golden_ref->y_stride * 16;
-    if (alt_ref) arf_y_offset += alt_ref->y_stride * 16;
     x->mv_limits.row_min -= 16;
     x->mv_limits.row_max -= 16;
     offset += cm->mb_cols;
diff --git a/libvpx/vp9/encoder/vp9_mcomp.c b/libvpx/vp9/encoder/vp9_mcomp.c
index ac29f36..1f08aa5 100644
--- a/libvpx/vp9/encoder/vp9_mcomp.c
+++ b/libvpx/vp9/encoder/vp9_mcomp.c
@@ -159,59 +159,63 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
 /* checks if (r, c) has better score than previous best */
-#define CHECK_BETTER(v, r, c)                                                \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                    \
-    int64_t tmpmse;                                                          \
-    const MV mv = { r, c };                                                  \
-    const MV ref_mv = { rr, rc };                                            \
-    if (second_pred == NULL) {                                               \
-      thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z,  \
-                         src_stride, &sse);                                  \
-    } else {                                                                 \
-      thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
-                          src_stride, &sse, second_pred);                    \
-    }                                                                        \
-    tmpmse = thismse;                                                        \
-    tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit);     \
-    if (tmpmse >= INT_MAX) {                                                 \
-      v = INT_MAX;                                                           \
-    } else if ((v = (uint32_t)tmpmse) < besterr) {                           \
-      besterr = v;                                                           \
-      br = r;                                                                \
-      bc = c;                                                                \
-      *distortion = thismse;                                                 \
-      *sse1 = sse;                                                           \
-    }                                                                        \
-  } else {                                                                   \
-    v = INT_MAX;                                                             \
-  }
+#define CHECK_BETTER(v, r, c)                                                  \
+  do {                                                                         \
+    if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                    \
+      int64_t tmpmse;                                                          \
+      const MV mv = { r, c };                                                  \
+      const MV ref_mv = { rr, rc };                                            \
+      if (second_pred == NULL) {                                               \
+        thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z,  \
+                           src_stride, &sse);                                  \
+      } else {                                                                 \
+        thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
+                            src_stride, &sse, second_pred);                    \
+      }                                                                        \
+      tmpmse = thismse;                                                        \
+      tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit);     \
+      if (tmpmse >= INT_MAX) {                                                 \
+        v = INT_MAX;                                                           \
+      } else if ((v = (uint32_t)tmpmse) < besterr) {                           \
+        besterr = v;                                                           \
+        br = r;                                                                \
+        bc = c;                                                                \
+        *distortion = thismse;                                                 \
+        *sse1 = sse;                                                           \
+      }                                                                        \
+    } else {                                                                   \
+      v = INT_MAX;                                                             \
+    }                                                                          \
+  } while (0)
 #else
 /* checks if (r, c) has better score than previous best */
-#define CHECK_BETTER(v, r, c)                                                \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                    \
-    const MV mv = { r, c };                                                  \
-    const MV ref_mv = { rr, rc };                                            \
-    if (second_pred == NULL)                                                 \
-      thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z,  \
-                         src_stride, &sse);                                  \
-    else                                                                     \
-      thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
-                          src_stride, &sse, second_pred);                    \
-    if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) +     \
-             thismse) < besterr) {                                           \
-      besterr = v;                                                           \
-      br = r;                                                                \
-      bc = c;                                                                \
-      *distortion = thismse;                                                 \
-      *sse1 = sse;                                                           \
-    }                                                                        \
-  } else {                                                                   \
-    v = INT_MAX;                                                             \
-  }
+#define CHECK_BETTER(v, r, c)                                                  \
+  do {                                                                         \
+    if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                    \
+      const MV mv = { r, c };                                                  \
+      const MV ref_mv = { rr, rc };                                            \
+      if (second_pred == NULL)                                                 \
+        thismse = vfp->svf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z,  \
+                           src_stride, &sse);                                  \
+      else                                                                     \
+        thismse = vfp->svaf(pre(y, y_stride, r, c), y_stride, sp(c), sp(r), z, \
+                            src_stride, &sse, second_pred);                    \
+      if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) +     \
+               thismse) < besterr) {                                           \
+        besterr = v;                                                           \
+        br = r;                                                                \
+        bc = c;                                                                \
+        *distortion = thismse;                                                 \
+        *sse1 = sse;                                                           \
+      }                                                                        \
+    } else {                                                                   \
+      v = INT_MAX;                                                             \
+    }                                                                          \
+  } while (0)
 
 #endif
 #define FIRST_LEVEL_CHECKS                                       \
-  {                                                              \
+  do {                                                           \
     unsigned int left, right, up, down, diag;                    \
     CHECK_BETTER(left, tr, tc - hstep);                          \
     CHECK_BETTER(right, tr, tc + hstep);                         \
@@ -224,10 +228,10 @@
       case 2: CHECK_BETTER(diag, tr + hstep, tc - hstep); break; \
       case 3: CHECK_BETTER(diag, tr + hstep, tc + hstep); break; \
     }                                                            \
-  }
+  } while (0)
 
 #define SECOND_LEVEL_CHECKS                                       \
-  {                                                               \
+  do {                                                            \
     int kr, kc;                                                   \
     unsigned int second;                                          \
     if (tr != br && tc != bc) {                                   \
@@ -256,7 +260,7 @@
         case 3: CHECK_BETTER(second, tr + kr, tc - hstep); break; \
       }                                                           \
     }                                                             \
-  }
+  } while (0)
 
 #define SETUP_SUBPEL_SEARCH                                                 \
   const uint8_t *const z = x->plane[0].src.buf;                             \
@@ -290,7 +294,7 @@
   maxr = subpel_mv_limits.row_max;                                          \
                                                                             \
   bestmv->row *= 8;                                                         \
-  bestmv->col *= 8;
+  bestmv->col *= 8
 
 static unsigned int setup_center_error(
     const MACROBLOCKD *xd, const MV *bestmv, const MV *ref_mv,
@@ -678,48 +682,52 @@
 // TODO(yunqing): this part can be further refactored.
 #if CONFIG_VP9_HIGHBITDEPTH
 /* checks if (r, c) has better score than previous best */
-#define CHECK_BETTER1(v, r, c)                                                 \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                      \
-    int64_t tmpmse;                                                            \
-    const MV mv = { r, c };                                                    \
-    const MV ref_mv = { rr, rc };                                              \
-    thismse =                                                                  \
-        accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, src_stride, \
-                                y, y_stride, second_pred, w, h, &sse);         \
-    tmpmse = thismse;                                                          \
-    tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit);       \
-    if (tmpmse >= INT_MAX) {                                                   \
-      v = INT_MAX;                                                             \
-    } else if ((v = (uint32_t)tmpmse) < besterr) {                             \
-      besterr = v;                                                             \
-      br = r;                                                                  \
-      bc = c;                                                                  \
-      *distortion = thismse;                                                   \
-      *sse1 = sse;                                                             \
-    }                                                                          \
-  } else {                                                                     \
-    v = INT_MAX;                                                               \
-  }
+#define CHECK_BETTER1(v, r, c)                                                \
+  do {                                                                        \
+    if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                   \
+      int64_t tmpmse;                                                         \
+      const MV mv = { r, c };                                                 \
+      const MV ref_mv = { rr, rc };                                           \
+      thismse = accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z,    \
+                                        src_stride, y, y_stride, second_pred, \
+                                        w, h, &sse);                          \
+      tmpmse = thismse;                                                       \
+      tmpmse += mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit);    \
+      if (tmpmse >= INT_MAX) {                                                \
+        v = INT_MAX;                                                          \
+      } else if ((v = (uint32_t)tmpmse) < besterr) {                          \
+        besterr = v;                                                          \
+        br = r;                                                               \
+        bc = c;                                                               \
+        *distortion = thismse;                                                \
+        *sse1 = sse;                                                          \
+      }                                                                       \
+    } else {                                                                  \
+      v = INT_MAX;                                                            \
+    }                                                                         \
+  } while (0)
 #else
 /* checks if (r, c) has better score than previous best */
-#define CHECK_BETTER1(v, r, c)                                                 \
-  if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                      \
-    const MV mv = { r, c };                                                    \
-    const MV ref_mv = { rr, rc };                                              \
-    thismse =                                                                  \
-        accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z, src_stride, \
-                                y, y_stride, second_pred, w, h, &sse);         \
-    if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) +       \
-             thismse) < besterr) {                                             \
-      besterr = v;                                                             \
-      br = r;                                                                  \
-      bc = c;                                                                  \
-      *distortion = thismse;                                                   \
-      *sse1 = sse;                                                             \
-    }                                                                          \
-  } else {                                                                     \
-    v = INT_MAX;                                                               \
-  }
+#define CHECK_BETTER1(v, r, c)                                                \
+  do {                                                                        \
+    if (c >= minc && c <= maxc && r >= minr && r <= maxr) {                   \
+      const MV mv = { r, c };                                                 \
+      const MV ref_mv = { rr, rc };                                           \
+      thismse = accurate_sub_pel_search(xd, &mv, x->me_sf, kernel, vfp, z,    \
+                                        src_stride, y, y_stride, second_pred, \
+                                        w, h, &sse);                          \
+      if ((v = mv_err_cost(&mv, &ref_mv, mvjcost, mvcost, error_per_bit) +    \
+               thismse) < besterr) {                                          \
+        besterr = v;                                                          \
+        br = r;                                                               \
+        bc = c;                                                               \
+        *distortion = thismse;                                                \
+        *sse1 = sse;                                                          \
+      }                                                                       \
+    } else {                                                                  \
+      v = INT_MAX;                                                            \
+    }                                                                         \
+  } while (0)
 
 #endif
 
@@ -1788,29 +1796,6 @@
   end_col = VPXMIN(center_mv->col + range, mv_limits->col_max);
   for (r = start_row; r <= end_row; r += 1) {
     c = start_col;
-    // sdx8f may not be available some block size
-    if (fn_ptr->sdx8f) {
-      while (c + 7 <= end_col) {
-        unsigned int sads[8];
-        const MV mv = { r, c };
-        const uint8_t *buf = get_buf_from_mv(pre, &mv);
-        fn_ptr->sdx8f(src->buf, src->stride, buf, pre->stride, sads);
-
-        for (i = 0; i < 8; ++i) {
-          int64_t sad = (int64_t)sads[i] << LOG2_PRECISION;
-          if (sad < best_sad) {
-            const MV mv = { r, c + i };
-            sad += lambda *
-                   vp9_nb_mvs_inconsistency(&mv, nb_full_mvs, full_mv_num);
-            if (sad < best_sad) {
-              best_sad = sad;
-              *best_mv = mv;
-            }
-          }
-        }
-        c += 8;
-      }
-    }
     while (c + 3 <= end_col) {
       unsigned int sads[4];
       const uint8_t *addrs[4];
@@ -2962,7 +2947,7 @@
   (void)sse;           \
   (void)thismse;       \
   (void)cost_list;     \
-  (void)use_accurate_subpel_search;
+  (void)use_accurate_subpel_search
 
 // Return the maximum MV.
 uint32_t vp9_return_max_sub_pixel_mv(
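
Wrapping CHECK_BETTER, CHECK_BETTER1, FIRST_LEVEL_CHECKS and SECOND_LEVEL_CHECKS in do { ... } while (0) makes each macro expand to exactly one statement, so it composes safely with if/else and requires the caller's semicolon; the same concern is why the stray trailing semicolons are dropped from SETUP_SUBPEL_SEARCH and from the (void) cast list above. A minimal demonstration with hypothetical macros, not taken from libvpx:

#include <stdio.h>

/* Unsafe: expands to two statements, so only the first one is guarded
 * when the macro is used as the body of an if. */
#define LOG_TWICE_BAD(msg) \
  printf("%s\n", (msg));   \
  printf("%s\n", (msg))

/* Safe: a do/while(0) block is a single statement and consumes the
 * caller's semicolon, so if/else chains stay well-formed. */
#define LOG_TWICE_OK(msg)  \
  do {                     \
    printf("%s\n", (msg)); \
    printf("%s\n", (msg)); \
  } while (0)

int main(void) {
  int verbose = 0;
  /* With LOG_TWICE_BAD here, the second printf would escape the if and
   * run unconditionally; adding an else would not even compile. */
  if (verbose) LOG_TWICE_OK("verbose");
  return 0;
}
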
diff --git a/libvpx/vp9/encoder/vp9_mcomp.h b/libvpx/vp9/encoder/vp9_mcomp.h
index 0c4d8f2..bdaf2ce 100644
--- a/libvpx/vp9/encoder/vp9_mcomp.h
+++ b/libvpx/vp9/encoder/vp9_mcomp.h
@@ -93,16 +93,6 @@
 extern fractional_mv_step_fp vp9_return_max_sub_pixel_mv;
 extern fractional_mv_step_fp vp9_return_min_sub_pixel_mv;
 
-typedef int (*vp9_full_search_fn_t)(const MACROBLOCK *x, const MV *ref_mv,
-                                    int sad_per_bit, int distance,
-                                    const vp9_variance_fn_ptr_t *fn_ptr,
-                                    const MV *center_mv, MV *best_mv);
-
-typedef int (*vp9_refining_search_fn_t)(const MACROBLOCK *x, MV *ref_mv,
-                                        int sad_per_bit, int distance,
-                                        const vp9_variance_fn_ptr_t *fn_ptr,
-                                        const MV *center_mv);
-
 typedef int (*vp9_diamond_search_fn_t)(
     const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv,
     int search_param, int sad_per_bit, int *num00,
diff --git a/libvpx/vp9/encoder/vp9_multi_thread.c b/libvpx/vp9/encoder/vp9_multi_thread.c
index c66c035..45659f2 100644
--- a/libvpx/vp9/encoder/vp9_multi_thread.c
+++ b/libvpx/vp9/encoder/vp9_multi_thread.c
@@ -36,7 +36,7 @@
   pthread_mutex_lock(mutex_handle);
 #endif
   next = job_queue_hdl->next;
-  if (NULL != next) {
+  if (next != NULL) {
     JobQueue *job_queue = (JobQueue *)next;
     job_info = &job_queue->job_info;
     // Update the next job in the queue
@@ -58,9 +58,10 @@
       (mi_cols_aligned_to_sb(cm->mi_rows) >> MI_BLOCK_SIZE_LOG2) + 1;
   int i;
 
-  this_tile->row_base_thresh_freq_fact =
+  CHECK_MEM_ERROR(
+      cm, this_tile->row_base_thresh_freq_fact,
       (int *)vpx_calloc(sb_rows * BLOCK_SIZES * MAX_MODES,
-                        sizeof(*(this_tile->row_base_thresh_freq_fact)));
+                        sizeof(*(this_tile->row_base_thresh_freq_fact))));
   for (i = 0; i < sb_rows * BLOCK_SIZES * MAX_MODES; i++)
     this_tile->row_base_thresh_freq_fact[i] = RD_THRESH_INIT_FACT;
 }
@@ -84,8 +85,8 @@
   multi_thread_ctxt->allocated_tile_rows = tile_rows;
   multi_thread_ctxt->allocated_vert_unit_rows = jobs_per_tile_col;
 
-  multi_thread_ctxt->job_queue =
-      (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue));
+  CHECK_MEM_ERROR(cm, multi_thread_ctxt->job_queue,
+                  (JobQueue *)vpx_memalign(32, total_jobs * sizeof(JobQueue)));
 
 #if CONFIG_MULTITHREAD
   // Create mutex for each tile
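
The vp9_multi_thread.c hunks route the vpx_calloc()/vpx_memalign() results through CHECK_MEM_ERROR instead of leaving them unchecked, so an allocation failure is reported immediately rather than surfacing later as a null dereference. A simplified sketch of a CHECK_MEM_ERROR-style macro; the real libvpx macro also takes the codec context and reports through vpx_internal_error() instead of exiting the process:

#include <stdio.h>
#include <stdlib.h>

/* Simplified stand-in for the libvpx CHECK_MEM_ERROR(cm, lval, expr). */
#define CHECK_MEM_ERROR(lval, expr)                      \
  do {                                                   \
    (lval) = (expr);                                     \
    if ((lval) == NULL) {                                \
      fprintf(stderr, "Failed to allocate " #lval "\n"); \
      exit(EXIT_FAILURE);                                \
    }                                                    \
  } while (0)

int main(void) {
  int *row_thresh;
  CHECK_MEM_ERROR(row_thresh, (int *)calloc(64, sizeof(*row_thresh)));
  row_thresh[0] = 32; /* safe: the pointer is known to be non-NULL here */
  free(row_thresh);
  return 0;
}
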
diff --git a/libvpx/vp9/encoder/vp9_non_greedy_mv.c b/libvpx/vp9/encoder/vp9_non_greedy_mv.c
index 4679d6c..d52801c 100644
--- a/libvpx/vp9/encoder/vp9_non_greedy_mv.c
+++ b/libvpx/vp9/encoder/vp9_non_greedy_mv.c
@@ -178,6 +178,7 @@
   motion_field_info->frame_num = frame_num;
   motion_field_info->motion_field_array =
       vpx_calloc(frame_num, sizeof(*motion_field_info->motion_field_array));
+  if (!motion_field_info->motion_field_array) return STATUS_FAILED;
   for (frame_idx = 0; frame_idx < frame_num; ++frame_idx) {
     for (rf_idx = 0; rf_idx < MAX_INTER_REF_FRAMES; ++rf_idx) {
       for (square_block_idx = 0; square_block_idx < SQUARE_BLOCK_SIZES;
@@ -422,6 +423,7 @@
   int row, col;
   int bw = 4 << b_width_log2_lookup[bsize];
   int bh = 4 << b_height_log2_lookup[bsize];
+  if (!(input && output)) goto fail;
   // copy search results to input buffer
   for (idx = 0; idx < rows * cols; ++idx) {
     input[idx].row = (float)search_mf[idx].row / bh;
@@ -450,6 +452,7 @@
     smooth_mf[idx].row = (int)(input[idx].row * bh);
     smooth_mf[idx].col = (int)(input[idx].col * bw);
   }
+fail:
   free(input);
   free(output);
 }
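
vp9_non_greedy_mv.c now bails out when either scratch buffer fails to allocate: a single goto fail skips the work and falls through to the unconditional frees, which is safe because free(NULL) is a no-op. A self-contained sketch of that cleanup pattern; smooth() and its buffers are illustrative:

#include <stdlib.h>
#include <string.h>

static int smooth(const float *in_mf, float *out_mf, size_t n) {
  float *input = (float *)malloc(n * sizeof(*input));
  float *output = (float *)malloc(n * sizeof(*output));
  int ret = -1;
  if (!(input && output)) goto fail; /* either allocation may have failed */

  memcpy(input, in_mf, n * sizeof(*input));
  /* ... iterative smoothing over input/output would go here ... */
  memcpy(out_mf, input, n * sizeof(*input));
  ret = 0;

fail:
  /* free(NULL) is a no-op, so this is safe on the failure path too. */
  free(input);
  free(output);
  return ret;
}

int main(void) {
  float in[4] = { 1, 2, 3, 4 }, out[4];
  return smooth(in, out, 4);
}
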
diff --git a/libvpx/vp9/encoder/vp9_pickmode.c b/libvpx/vp9/encoder/vp9_pickmode.c
index 695fd48..697c589 100644
--- a/libvpx/vp9/encoder/vp9_pickmode.c
+++ b/libvpx/vp9/encoder/vp9_pickmode.c
@@ -771,24 +771,27 @@
         const int16_t *src_diff;
         src_diff = &p->src_diff[(r * diff_stride + c) << 2];
 
+        // The skip-block condition should be handled before this function is
+        // called.
+        assert(!x->skip_block);
+
         switch (tx_size) {
           case TX_16X16:
             vpx_hadamard_16x16(src_diff, diff_stride, coeff);
-            vp9_quantize_fp(coeff, 256, x->skip_block, p->round_fp, p->quant_fp,
-                            qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
+            vp9_quantize_fp(coeff, 256, p->round_fp, p->quant_fp, qcoeff,
+                            dqcoeff, pd->dequant, eob, scan_order->scan,
                             scan_order->iscan);
             break;
           case TX_8X8:
             vpx_hadamard_8x8(src_diff, diff_stride, coeff);
-            vp9_quantize_fp(coeff, 64, x->skip_block, p->round_fp, p->quant_fp,
-                            qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
+            vp9_quantize_fp(coeff, 64, p->round_fp, p->quant_fp, qcoeff,
+                            dqcoeff, pd->dequant, eob, scan_order->scan,
                             scan_order->iscan);
             break;
           default:
             assert(tx_size == TX_4X4);
             x->fwd_txfm4x4(src_diff, coeff, diff_stride);
-            vp9_quantize_fp(coeff, 16, x->skip_block, p->round_fp, p->quant_fp,
-                            qcoeff, dqcoeff, pd->dequant, eob, scan_order->scan,
+            vp9_quantize_fp(coeff, 16, p->round_fp, p->quant_fp, qcoeff,
+                            dqcoeff, pd->dequant, eob, scan_order->scan,
                             scan_order->iscan);
             break;
         }
@@ -1247,7 +1250,7 @@
     VP9_COMP *cpi, MACROBLOCK *x, MV_REFERENCE_FRAME ref_frame,
     int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES],
     int const_motion[MAX_REF_FRAMES], int *ref_frame_skip_mask,
-    const int flag_list[4], TileDataEnc *tile_data, int mi_row, int mi_col,
+    TileDataEnc *tile_data, int mi_row, int mi_col,
     struct buf_2d yv12_mb[4][MAX_MB_PLANE], BLOCK_SIZE bsize,
     int force_skip_low_temp_var, int comp_pred_allowed) {
   VP9_COMMON *const cm = &cpi->common;
@@ -1259,7 +1262,7 @@
   frame_mv[NEWMV][ref_frame].as_int = INVALID_MV;
   frame_mv[ZEROMV][ref_frame].as_int = 0;
   // this needs various further optimizations. to be continued..
-  if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
+  if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) && (yv12 != NULL)) {
     int_mv *const candidates = x->mbmi_ext->ref_mvs[ref_frame];
     const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
     vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf, sf);
@@ -1690,8 +1693,6 @@
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   uint8_t mode_checked[MB_MODE_COUNT][MAX_REF_FRAMES];
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
   RD_COST this_rdc, best_rdc;
   // var_y and sse_y are saved to be used in skipping checking
   unsigned int var_y = UINT_MAX;
@@ -1925,14 +1926,14 @@
   // constrain the inter mode to only test zero motion.
   if (cpi->use_svc && svc->force_zero_mode_spatial_ref &&
       svc->spatial_layer_id > 0 && !gf_temporal_ref) {
-    if (cpi->ref_frame_flags & flag_list[LAST_FRAME]) {
+    if (cpi->ref_frame_flags & VP9_LAST_FLAG) {
       struct scale_factors *const sf = &cm->frame_refs[LAST_FRAME - 1].sf;
       if (vp9_is_scaled(sf)) {
         svc_force_zero_mode[LAST_FRAME - 1] = 1;
         inter_layer_ref = LAST_FRAME;
       }
     }
-    if (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) {
+    if (cpi->ref_frame_flags & VP9_GOLD_FLAG) {
       struct scale_factors *const sf = &cm->frame_refs[GOLDEN_FRAME - 1].sf;
       if (vp9_is_scaled(sf)) {
         svc_force_zero_mode[GOLDEN_FRAME - 1] = 1;
@@ -1957,7 +1958,7 @@
                                  cpi->rc.avg_frame_low_motion < 60))
     usable_ref_frame = LAST_FRAME;
 
-  if (!((cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) &&
+  if (!((cpi->ref_frame_flags & VP9_GOLD_FLAG) &&
         !svc_force_zero_mode[GOLDEN_FRAME - 1] && !force_skip_low_temp_var))
     use_golden_nonzeromv = 0;
 
@@ -1985,12 +1986,11 @@
     // Skip find_predictor if the reference frame is not in the
     // ref_frame_flags (i.e., not used as a reference for this frame).
     skip_ref_find_pred[ref_frame] =
-        !(cpi->ref_frame_flags & flag_list[ref_frame]);
+        !(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame));
     if (!skip_ref_find_pred[ref_frame]) {
       find_predictors(cpi, x, ref_frame, frame_mv, const_motion,
-                      &ref_frame_skip_mask, flag_list, tile_data, mi_row,
-                      mi_col, yv12_mb, bsize, force_skip_low_temp_var,
-                      comp_modes > 0);
+                      &ref_frame_skip_mask, tile_data, mi_row, mi_col, yv12_mb,
+                      bsize, force_skip_low_temp_var, comp_modes > 0);
     }
   }
 
@@ -2014,7 +2014,7 @@
   // than current layer: force check of GF-ZEROMV before early exit
   // due to skip flag.
   if (svc->spatial_layer_id > 0 && no_scaling &&
-      (cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) &&
+      (cpi->ref_frame_flags & VP9_GOLD_FLAG) &&
       cm->base_qindex > svc->lower_layer_qindex + 10)
     force_test_gf_zeromv = 1;
 
@@ -2094,7 +2094,8 @@
     if (comp_pred) {
       if (!cpi->allow_comp_inter_inter) continue;
       // Skip compound inter modes if ARF is not available.
-      if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue;
+      if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame)))
+        continue;
       // Do not allow compound prediction if the segment level reference frame
       // feature is in use as in this case there can only be one reference.
       if (segfeature_active(seg, mi->segment_id, SEG_LVL_REF_FRAME)) continue;
@@ -2107,7 +2108,7 @@
          (!cpi->use_svc && sse_zeromv_normalized < thresh_skip_golden)))
       continue;
 
-    if (!(cpi->ref_frame_flags & flag_list[ref_frame])) continue;
+    if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) continue;
 
     // For screen content. If zero_temp_sad source is computed: skip
     // non-zero motion check for stationary blocks. If the superblock is
@@ -2190,7 +2191,7 @@
         if (usable_ref_frame < ALTREF_FRAME) {
           if (!force_skip_low_temp_var && usable_ref_frame > LAST_FRAME) {
             i = (ref_frame == LAST_FRAME) ? GOLDEN_FRAME : LAST_FRAME;
-            if ((cpi->ref_frame_flags & flag_list[i]))
+            if ((cpi->ref_frame_flags & ref_frame_to_flag(i)))
               if (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[i] << 1))
                 ref_frame_skip_mask |= (1 << ref_frame);
           }
@@ -2199,9 +2200,9 @@
                      ref_frame == ALTREF_FRAME)) {
           int ref1 = (ref_frame == GOLDEN_FRAME) ? LAST_FRAME : GOLDEN_FRAME;
           int ref2 = (ref_frame == ALTREF_FRAME) ? LAST_FRAME : ALTREF_FRAME;
-          if (((cpi->ref_frame_flags & flag_list[ref1]) &&
+          if (((cpi->ref_frame_flags & ref_frame_to_flag(ref1)) &&
                (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref1] << 1))) ||
-              ((cpi->ref_frame_flags & flag_list[ref2]) &&
+              ((cpi->ref_frame_flags & ref_frame_to_flag(ref2)) &&
                (x->pred_mv_sad[ref_frame] > (x->pred_mv_sad[ref2] << 1))))
             ref_frame_skip_mask |= (1 << ref_frame);
         }
@@ -2488,7 +2489,7 @@
     perform_intra_pred =
         svc->temporal_layer_id == 0 ||
         svc->layer_context[svc->temporal_layer_id].is_key_frame ||
-        !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) ||
+        !(cpi->ref_frame_flags & VP9_GOLD_FLAG) ||
         (!svc->layer_context[svc->temporal_layer_id].is_key_frame &&
          svc_force_zero_mode[best_pickmode.best_ref_frame - 1]);
     inter_mode_thresh = (inter_mode_thresh << 1) + inter_mode_thresh;
@@ -2747,8 +2748,6 @@
   MV_REFERENCE_FRAME best_ref_frame = NONE;
   unsigned char segment_id = mi->segment_id;
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
   int64_t best_rd = INT64_MAX;
   b_mode_info bsi[MAX_REF_FRAMES][4];
   int ref_frame_skip_mask = 0;
@@ -2764,7 +2763,8 @@
     int_mv dummy_mv[2];
     x->pred_mv_sad[ref_frame] = INT_MAX;
 
-    if ((cpi->ref_frame_flags & flag_list[ref_frame]) && (yv12 != NULL)) {
+    if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) &&
+        (yv12 != NULL)) {
       int_mv *const candidates = mbmi_ext->ref_mvs[ref_frame];
       const struct scale_factors *const sf = &cm->frame_refs[ref_frame - 1].sf;
       vp9_setup_pred_block(xd, yv12_mb[ref_frame], yv12, mi_row, mi_col, sf,
diff --git a/libvpx/vp9/encoder/vp9_quantize.c b/libvpx/vp9/encoder/vp9_quantize.c
index c996b75..9058997 100644
--- a/libvpx/vp9/encoder/vp9_quantize.c
+++ b/libvpx/vp9/encoder/vp9_quantize.c
@@ -22,15 +22,12 @@
 #include "vp9/encoder/vp9_rd.h"
 
 void vp9_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                       int skip_block, const int16_t *round_ptr,
-                       const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-                       tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                       uint16_t *eob_ptr, const int16_t *scan,
-                       const int16_t *iscan) {
+                       const int16_t *round_ptr, const int16_t *quant_ptr,
+                       tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                       const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                       const int16_t *scan, const int16_t *iscan) {
   int i, eob = -1;
   (void)iscan;
-  (void)skip_block;
-  assert(!skip_block);
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
@@ -56,7 +53,7 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp9_highbd_quantize_fp_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                              int skip_block, const int16_t *round_ptr,
+                              const int16_t *round_ptr,
                               const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
                               tran_low_t *dqcoeff_ptr,
                               const int16_t *dequant_ptr, uint16_t *eob_ptr,
@@ -65,8 +62,6 @@
   int eob = -1;
 
   (void)iscan;
-  (void)skip_block;
-  assert(!skip_block);
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
@@ -91,15 +86,12 @@
 // TODO(jingning) Refactor this file and combine functions with similar
 // operations.
 void vp9_quantize_fp_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                             int skip_block, const int16_t *round_ptr,
-                             const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-                             tran_low_t *dqcoeff_ptr,
+                             const int16_t *round_ptr, const int16_t *quant_ptr,
+                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
                              const int16_t *scan, const int16_t *iscan) {
   int i, eob = -1;
   (void)iscan;
-  (void)skip_block;
-  assert(!skip_block);
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
@@ -126,15 +118,13 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vp9_highbd_quantize_fp_32x32_c(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *round_ptr, const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan, const int16_t *iscan) {
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *round_ptr,
+    const int16_t *quant_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+    const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan,
+    const int16_t *iscan) {
   int i, eob = -1;
 
   (void)iscan;
-  (void)skip_block;
-  assert(!skip_block);
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
@@ -176,16 +166,15 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
   if (xd->cur_buf->flags & YV12_FLAG_HIGHBITDEPTH) {
-    vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs,
-                          x->skip_block, p->zbin, p->round, p->quant,
-                          p->quant_shift, qcoeff, dqcoeff, pd->dequant,
-                          &p->eobs[block], scan, iscan);
+    vpx_highbd_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin,
+                          p->round, p->quant, p->quant_shift, qcoeff, dqcoeff,
+                          pd->dequant, &p->eobs[block], scan, iscan);
     return;
   }
 #endif
-  vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, x->skip_block,
-                 p->zbin, p->round, p->quant, p->quant_shift, qcoeff, dqcoeff,
-                 pd->dequant, &p->eobs[block], scan, iscan);
+  vpx_quantize_b(BLOCK_OFFSET(p->coeff, block), n_coeffs, p->zbin, p->round,
+                 p->quant, p->quant_shift, qcoeff, dqcoeff, pd->dequant,
+                 &p->eobs[block], scan, iscan);
 }
 
 static void invert_quant(int16_t *quant, int16_t *shift, int d) {
diff --git a/libvpx/vp9/encoder/vp9_ratectrl.c b/libvpx/vp9/encoder/vp9_ratectrl.c
index d0d83a8..0852973 100644
--- a/libvpx/vp9/encoder/vp9_ratectrl.c
+++ b/libvpx/vp9/encoder/vp9_ratectrl.c
@@ -223,9 +223,10 @@
   if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
 
   if (oxcf->rc_max_inter_bitrate_pct) {
-    const int max_rate =
-        rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
-    target = VPXMIN(target, max_rate);
+    const int64_t max_rate =
+        (int64_t)rc->avg_frame_bandwidth * oxcf->rc_max_inter_bitrate_pct / 100;
+    // target is of type int, so VPXMIN cannot evaluate to a value larger
+    // than target; the cast back to int is therefore safe.
+    target = (int)VPXMIN(target, max_rate);
   }
   return target;
 }
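
Both rate clamps now form avg_frame_bandwidth * pct in 64-bit before dividing; with a large average bandwidth and a percentage in the hundreds, the 32-bit product can overflow. Because target, an int, is one operand of VPXMIN, the result always fits back in an int. A compact illustration with stand-in numbers:

#include <stdint.h>
#include <stdio.h>

#define MIN64(a, b) ((a) < (b) ? (a) : (b))

int main(void) {
  const int avg_frame_bandwidth = 40 * 1000 * 1000; /* stand-in value */
  const int max_inter_bitrate_pct = 150;
  int target = 50 * 1000 * 1000;

  /* In 32-bit, 40e6 * 150 = 6e9 overflows int (undefined behavior);
   * promoting one operand makes the whole expression 64-bit. */
  const int64_t max_rate =
      (int64_t)avg_frame_bandwidth * max_inter_bitrate_pct / 100;

  /* The minimum of an int and anything never exceeds that int. */
  target = (int)MIN64(target, max_rate);
  printf("clamped target: %d\n", target);
  return 0;
}
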
@@ -234,9 +235,9 @@
   const RATE_CONTROL *rc = &cpi->rc;
   const VP9EncoderConfig *oxcf = &cpi->oxcf;
   if (oxcf->rc_max_intra_bitrate_pct) {
-    const int max_rate =
-        rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100;
-    target = VPXMIN(target, max_rate);
+    const int64_t max_rate =
+        (int64_t)rc->avg_frame_bandwidth * oxcf->rc_max_intra_bitrate_pct / 100;
+    target = (int)VPXMIN(target, max_rate);
   }
   if (target > rc->max_frame_bandwidth) target = rc->max_frame_bandwidth;
   return target;
@@ -277,9 +278,9 @@
         svc->current_superframe > 0) {
       // TODO(marpan): This may need to be modified for temporal layers.
       const double framerate_pts = 10000000.0 / ts_delta;
-      lrc->bits_off_target += (int)(lc->target_bandwidth / framerate_pts);
+      lrc->bits_off_target += (int)round(lc->target_bandwidth / framerate_pts);
     } else {
-      lrc->bits_off_target += (int)(lc->target_bandwidth / lc->framerate);
+      lrc->bits_off_target += (int)round(lc->target_bandwidth / lc->framerate);
     }
     // Clip buffer level to maximum buffer size for the layer.
     lrc->bits_off_target =
@@ -2213,7 +2214,6 @@
   // only 3 reference buffers can be updated, but for temporal layers > 1
   // we generally need to use buffer slots 4 and 5.
   if ((cm->current_video_frame == 0 && svc->number_temporal_layers > 1) ||
-      svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS ||
       svc->number_spatial_layers > 3 || svc->number_temporal_layers > 3 ||
       svc->number_spatial_layers == 1)
     return;
@@ -2234,11 +2234,15 @@
     cpi->lst_fb_idx = -1;
     cpi->gld_fb_idx = -1;
     cpi->alt_fb_idx = -1;
+    svc->update_buffer_slot[0] = 0;
     // For intra-only frame we need to refresh all slots that were
     // being used for the base layer (fb_idx_base[i] == 1).
     // Start with assigning last first, then golden and then alt.
     for (i = 0; i < REF_FRAMES; ++i) {
-      if (svc->fb_idx_base[i] == 1) count++;
+      if (svc->fb_idx_base[i] == 1) {
+        svc->update_buffer_slot[0] |= 1 << i;
+        count++;
+      }
       if (count == 1 && cpi->lst_fb_idx == -1) cpi->lst_fb_idx = i;
       if (count == 2 && cpi->gld_fb_idx == -1) cpi->gld_fb_idx = i;
       if (count == 3 && cpi->alt_fb_idx == -1) cpi->alt_fb_idx = i;
@@ -2247,6 +2251,12 @@
     // to the lst_fb_idx.
     if (cpi->gld_fb_idx == -1) cpi->gld_fb_idx = cpi->lst_fb_idx;
     if (cpi->alt_fb_idx == -1) cpi->alt_fb_idx = cpi->lst_fb_idx;
+    if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+      cpi->ext_refresh_last_frame = 0;
+      cpi->ext_refresh_golden_frame = 0;
+      cpi->ext_refresh_alt_ref_frame = 0;
+      cpi->ref_frame_flags = 0;
+    }
   }
 }
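
For the intra-only SVC frame, the loop above records every base-layer slot it will refresh by or-ing 1 << i into update_buffer_slot[0]. A tiny sketch of that bitmask bookkeeping, with made-up slot data:

#include <stdio.h>

#define REF_FRAMES 8

int main(void) {
  /* Slots the base layer was using (stand-in data for fb_idx_base). */
  const int fb_idx_base[REF_FRAMES] = { 1, 0, 1, 0, 0, 1, 0, 0 };
  int update_buffer_slot = 0;
  int i;

  for (i = 0; i < REF_FRAMES; ++i)
    if (fb_idx_base[i]) update_buffer_slot |= 1 << i; /* mark slot i */

  for (i = 0; i < REF_FRAMES; ++i)
    if (update_buffer_slot & (1 << i)) printf("refresh slot %d\n", i);
  return 0;
}
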
 
@@ -2389,6 +2399,9 @@
     set_intra_only_frame(cpi);
     target = vp9_calc_iframe_target_size_one_pass_cbr(cpi);
   }
+  // The overlay frame after an intra-only frame predicts from LAST.
+  if (svc->previous_frame_is_intra_only) cpi->ref_frame_flags |= VP9_LAST_FLAG;
+
   // Any update/change of global cyclic refresh parameters (amount/delta-qp)
   // should be done here, before the frame qp is selected.
   if (cpi->oxcf.aq_mode == CYCLIC_REFRESH_AQ)
diff --git a/libvpx/vp9/encoder/vp9_rdopt.c b/libvpx/vp9/encoder/vp9_rdopt.c
index a1687dc..0171a05 100644
--- a/libvpx/vp9/encoder/vp9_rdopt.c
+++ b/libvpx/vp9/encoder/vp9_rdopt.c
@@ -3315,8 +3315,6 @@
   int_mv single_newmv[MAX_REF_FRAMES] = { { 0 } };
   INTERP_FILTER single_inter_filter[MB_MODE_COUNT][MAX_REF_FRAMES];
   int single_skippable[MB_MODE_COUNT][MAX_REF_FRAMES];
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
   int64_t best_rd = best_rd_so_far;
   int64_t best_pred_diff[REFERENCE_MODES];
   int64_t best_pred_rd[REFERENCE_MODES];
@@ -3392,7 +3390,7 @@
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
     x->pred_mv_sad[ref_frame] = INT_MAX;
-    if ((cpi->ref_frame_flags & flag_list[ref_frame]) &&
+    if ((cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) &&
         !(is_rect_partition && (ctx->skip_ref_frame_mask & (1 << ref_frame)))) {
       assert(get_ref_frame_buffer(cpi, ref_frame) != NULL);
       setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
@@ -3403,7 +3401,7 @@
   }
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
-    if (!(cpi->ref_frame_flags & flag_list[ref_frame])) {
+    if (!(cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) {
       // Skip checking missing references in both single and compound reference
       // modes. Note that a mode will be skipped if both reference frames
       // are masked out.
@@ -3609,7 +3607,8 @@
         continue;
 
       // Skip compound inter modes if ARF is not available.
-      if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue;
+      if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame)))
+        continue;
 
       // Do not allow compound prediction if the segment level reference frame
       // feature is in use as in this case there can only be one reference.
@@ -4140,8 +4139,6 @@
   int comp_pred, i;
   int_mv frame_mv[MB_MODE_COUNT][MAX_REF_FRAMES];
   struct buf_2d yv12_mb[4][MAX_MB_PLANE];
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
   int64_t best_rd = best_rd_so_far;
   int64_t best_yrd = best_rd_so_far;  // FIXME(rbultje) more precise
   int64_t best_pred_diff[REFERENCE_MODES];
@@ -4191,7 +4188,7 @@
   rd_cost->rate = INT_MAX;
 
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
-    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+    if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) {
       setup_buffer_inter(cpi, x, ref_frame, bsize, mi_row, mi_col,
                          frame_mv[NEARESTMV], frame_mv[NEARMV], yv12_mb);
     } else {
@@ -4276,7 +4273,8 @@
           cm->ref_frame_sign_bias[second_ref_frame])
         continue;
 
-      if (!(cpi->ref_frame_flags & flag_list[second_ref_frame])) continue;
+      if (!(cpi->ref_frame_flags & ref_frame_to_flag(second_ref_frame)))
+        continue;
       // Do not allow compound prediction if the segment level reference frame
       // feature is in use as in this case there can only be one reference.
       if (segfeature_active(seg, segment_id, SEG_LVL_REF_FRAME)) continue;
diff --git a/libvpx/vp9/encoder/vp9_speed_features.c b/libvpx/vp9/encoder/vp9_speed_features.c
index 81695e9..0431d8a 100644
--- a/libvpx/vp9/encoder/vp9_speed_features.c
+++ b/libvpx/vp9/encoder/vp9_speed_features.c
@@ -495,11 +495,10 @@
         (cpi->external_resize == 1 ||
          cpi->oxcf.resize_mode == RESIZE_DYNAMIC)) {
       MV_REFERENCE_FRAME ref_frame;
-      static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                        VP9_ALT_FLAG };
       for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
         const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
-        if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) {
+        if (yv12 != NULL &&
+            (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) {
           const struct scale_factors *const scale_fac =
               &cm->frame_refs[ref_frame - 1].sf;
           if (vp9_is_scaled(scale_fac)) sf->reference_masking = 0;
@@ -653,8 +652,10 @@
       if (cpi->content_state_sb_fd == NULL &&
           (!cpi->use_svc ||
            svc->spatial_layer_id == svc->number_spatial_layers - 1)) {
-        cpi->content_state_sb_fd = (uint8_t *)vpx_calloc(
-            (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1), sizeof(uint8_t));
+        CHECK_MEM_ERROR(cm, cpi->content_state_sb_fd,
+                        (uint8_t *)vpx_calloc(
+                            (cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
+                            sizeof(uint8_t)));
       }
     }
     if (cpi->oxcf.rc_mode == VPX_CBR && content != VP9E_CONTENT_SCREEN) {
@@ -805,14 +806,17 @@
       sf->partition_search_type = FIXED_PARTITION;
       sf->always_this_block_size = BLOCK_64X64;
     }
-    if (cpi->count_arf_frame_usage == NULL)
-      cpi->count_arf_frame_usage =
+    if (cpi->count_arf_frame_usage == NULL) {
+      CHECK_MEM_ERROR(
+          cm, cpi->count_arf_frame_usage,
           (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
-                                sizeof(*cpi->count_arf_frame_usage));
+                                sizeof(*cpi->count_arf_frame_usage)));
+    }
     if (cpi->count_lastgolden_frame_usage == NULL)
-      cpi->count_lastgolden_frame_usage =
+      CHECK_MEM_ERROR(
+          cm, cpi->count_lastgolden_frame_usage,
           (uint8_t *)vpx_calloc((cm->mi_stride >> 3) * ((cm->mi_rows >> 3) + 1),
-                                sizeof(*cpi->count_lastgolden_frame_usage));
+                                sizeof(*cpi->count_lastgolden_frame_usage)));
   }
   if (svc->previous_frame_is_intra_only) {
     sf->partition_search_type = FIXED_PARTITION;
diff --git a/libvpx/vp9/encoder/vp9_subexp.c b/libvpx/vp9/encoder/vp9_subexp.c
index 19bbd53..3953253 100644
--- a/libvpx/vp9/encoder/vp9_subexp.c
+++ b/libvpx/vp9/encoder/vp9_subexp.c
@@ -114,19 +114,20 @@
   encode_term_subexp(w, delp);
 }
 
-int vp9_prob_diff_update_savings_search(const unsigned int *ct, vpx_prob oldp,
-                                        vpx_prob *bestp, vpx_prob upd) {
-  const int old_b = cost_branch256(ct, oldp);
-  int bestsavings = 0;
+int64_t vp9_prob_diff_update_savings_search(const unsigned int *ct,
+                                            vpx_prob oldp, vpx_prob *bestp,
+                                            vpx_prob upd) {
+  const int64_t old_b = cost_branch256(ct, oldp);
+  int64_t bestsavings = 0;
   vpx_prob newp, bestnewp = oldp;
   const int step = *bestp > oldp ? -1 : 1;
   const int upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd);
 
   if (old_b > upd_cost + (MIN_DELP_BITS << VP9_PROB_COST_SHIFT)) {
     for (newp = *bestp; newp != oldp; newp += step) {
-      const int new_b = cost_branch256(ct, newp);
-      const int update_b = prob_diff_update_cost(newp, oldp) + upd_cost;
-      const int savings = old_b - new_b - update_b;
+      const int64_t new_b = cost_branch256(ct, newp);
+      const int64_t update_b = prob_diff_update_cost(newp, oldp) + upd_cost;
+      const int64_t savings = old_b - new_b - update_b;
       if (savings > bestsavings) {
         bestsavings = savings;
         bestnewp = newp;
@@ -137,15 +138,15 @@
   return bestsavings;
 }
 
-int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
-                                              const vpx_prob oldp,
-                                              vpx_prob *bestp, vpx_prob upd,
-                                              int stepsize) {
-  int i, old_b, new_b, update_b, savings, bestsavings;
-  int newp;
-  const int step_sign = *bestp > oldp ? -1 : 1;
-  const int step = stepsize * step_sign;
-  const int upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd);
+int64_t vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
+                                                  const vpx_prob oldp,
+                                                  vpx_prob *bestp, vpx_prob upd,
+                                                  int stepsize) {
+  int64_t i, old_b, new_b, update_b, savings, bestsavings;
+  int64_t newp;
+  const int64_t step_sign = *bestp > oldp ? -1 : 1;
+  const int64_t step = stepsize * step_sign;
+  const int64_t upd_cost = vp9_cost_one(upd) - vp9_cost_zero(upd);
   const vpx_prob *newplist, *oldplist;
   vpx_prob bestnewp;
   oldplist = vp9_pareto8_full[oldp - 1];
@@ -162,14 +163,14 @@
     for (newp = *bestp; (newp - oldp) * step_sign < 0; newp += step) {
       if (newp < 1 || newp > 255) continue;
       newplist = vp9_pareto8_full[newp - 1];
-      new_b = cost_branch256(ct + 2 * PIVOT_NODE, newp);
+      new_b = cost_branch256(ct + 2 * PIVOT_NODE, (vpx_prob)newp);
       for (i = UNCONSTRAINED_NODES; i < ENTROPY_NODES; ++i)
         new_b += cost_branch256(ct + 2 * i, newplist[i - UNCONSTRAINED_NODES]);
-      update_b = prob_diff_update_cost(newp, oldp) + upd_cost;
+      update_b = prob_diff_update_cost((vpx_prob)newp, oldp) + upd_cost;
       savings = old_b - new_b - update_b;
       if (savings > bestsavings) {
         bestsavings = savings;
-        bestnewp = newp;
+        bestnewp = (vpx_prob)newp;
       }
     }
   }
@@ -182,7 +183,7 @@
                                const unsigned int ct[2]) {
   const vpx_prob upd = DIFF_UPDATE_PROB;
   vpx_prob newp = get_binary_prob(ct[0], ct[1]);
-  const int savings =
+  const int64_t savings =
       vp9_prob_diff_update_savings_search(ct, *oldp, &newp, upd);
   assert(newp >= 1);
   if (savings > 0) {
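
vp9_prob_diff_update_savings_search() and its model variant now do their cost arithmetic in int64_t: cost_branch256() scales symbol counts by the probability-cost shift, and with large counts the old int expression old_b - new_b - update_b could overflow. A compact illustration of the widening fix, with stand-in counts and costs:

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

/* Stand-in for a branch cost in fractional-bit units, as produced by a
 * cost_branch256()-style helper. */
static int64_t branch_cost(unsigned int count, int cost_per_symbol) {
  return (int64_t)count * cost_per_symbol;
}

int main(void) {
  /* With ~2^23 symbols at ~520 units each, the product already exceeds
   * the int range, so the subtraction must happen in 64-bit. */
  const unsigned int count = 1u << 23;
  const int64_t old_b = branch_cost(count, 520);
  const int64_t new_b = branch_cost(count, 500);
  const int64_t update_b = 4096;
  const int64_t savings = old_b - new_b - update_b; /* safe in 64-bit */
  printf("savings = %" PRId64 "\n", savings);
  return 0;
}
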
diff --git a/libvpx/vp9/encoder/vp9_subexp.h b/libvpx/vp9/encoder/vp9_subexp.h
index f0d544b..2d016d2 100644
--- a/libvpx/vp9/encoder/vp9_subexp.h
+++ b/libvpx/vp9/encoder/vp9_subexp.h
@@ -25,13 +25,14 @@
 void vp9_cond_prob_diff_update(struct vpx_writer *w, vpx_prob *oldp,
                                const unsigned int ct[2]);
 
-int vp9_prob_diff_update_savings_search(const unsigned int *ct, vpx_prob oldp,
-                                        vpx_prob *bestp, vpx_prob upd);
+int64_t vp9_prob_diff_update_savings_search(const unsigned int *ct,
+                                            vpx_prob oldp, vpx_prob *bestp,
+                                            vpx_prob upd);
 
-int vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
-                                              const vpx_prob oldp,
-                                              vpx_prob *bestp, vpx_prob upd,
-                                              int stepsize);
+int64_t vp9_prob_diff_update_savings_search_model(const unsigned int *ct,
+                                                  const vpx_prob oldp,
+                                                  vpx_prob *bestp, vpx_prob upd,
+                                                  int stepsize);
 
 #ifdef __cplusplus
 }  // extern "C"
diff --git a/libvpx/vp9/encoder/vp9_svc_layercontext.c b/libvpx/vp9/encoder/vp9_svc_layercontext.c
index ad3a8f7..a57a70a 100644
--- a/libvpx/vp9/encoder/vp9_svc_layercontext.c
+++ b/libvpx/vp9/encoder/vp9_svc_layercontext.c
@@ -462,30 +462,27 @@
   // fb_idx for that reference to the first one used/referenced.
   // This is to avoid setting fb_idx for a reference to a slot that is not
   // used/needed (i.e., since that reference is not referenced or refreshed).
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
   MV_REFERENCE_FRAME ref_frame;
   MV_REFERENCE_FRAME first_ref = 0;
   int first_fb_idx = 0;
   int fb_idx[3] = { cpi->lst_fb_idx, cpi->gld_fb_idx, cpi->alt_fb_idx };
   for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ref_frame++) {
-    if (cpi->ref_frame_flags & flag_list[ref_frame]) {
+    if (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame)) {
       first_ref = ref_frame;
       first_fb_idx = fb_idx[ref_frame - 1];
       break;
     }
   }
   if (first_ref > 0) {
-    if (first_ref != LAST_FRAME &&
-        !(cpi->ref_frame_flags & flag_list[LAST_FRAME]) &&
+    if (first_ref != LAST_FRAME && !(cpi->ref_frame_flags & VP9_LAST_FLAG) &&
         !cpi->ext_refresh_last_frame)
       cpi->lst_fb_idx = first_fb_idx;
     else if (first_ref != GOLDEN_FRAME &&
-             !(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]) &&
+             !(cpi->ref_frame_flags & VP9_GOLD_FLAG) &&
              !cpi->ext_refresh_golden_frame)
       cpi->gld_fb_idx = first_fb_idx;
     else if (first_ref != ALTREF_FRAME &&
-             !(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]) &&
+             !(cpi->ref_frame_flags & VP9_ALT_FLAG) &&
              !cpi->ext_refresh_alt_ref_frame)
       cpi->alt_fb_idx = first_fb_idx;
   }
@@ -730,8 +727,6 @@
 
 void vp9_copy_flags_ref_update_idx(VP9_COMP *const cpi) {
   SVC *const svc = &cpi->svc;
-  static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                    VP9_ALT_FLAG };
   int sl = svc->spatial_layer_id;
   svc->lst_fb_idx[sl] = cpi->lst_fb_idx;
   svc->gld_fb_idx[sl] = cpi->gld_fb_idx;
@@ -754,12 +749,9 @@
   svc->update_golden[sl] = (uint8_t)cpi->refresh_golden_frame;
   svc->update_altref[sl] = (uint8_t)cpi->refresh_alt_ref_frame;
 
-  svc->reference_last[sl] =
-      (uint8_t)(cpi->ref_frame_flags & flag_list[LAST_FRAME]);
-  svc->reference_golden[sl] =
-      (uint8_t)(cpi->ref_frame_flags & flag_list[GOLDEN_FRAME]);
-  svc->reference_altref[sl] =
-      (uint8_t)(cpi->ref_frame_flags & flag_list[ALTREF_FRAME]);
+  svc->reference_last[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_LAST_FLAG);
+  svc->reference_golden[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_GOLD_FLAG);
+  svc->reference_altref[sl] = (uint8_t)(cpi->ref_frame_flags & VP9_ALT_FLAG);
 }
 
 int vp9_one_pass_cbr_svc_start_layer(VP9_COMP *const cpi) {
@@ -1080,15 +1072,14 @@
       svc->disable_inter_layer_pred == INTER_LAYER_PRED_OFF ||
       svc->drop_spatial_layer[sl - 1]) {
     MV_REFERENCE_FRAME ref_frame;
-    static const int flag_list[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
-                                      VP9_ALT_FLAG };
     for (ref_frame = LAST_FRAME; ref_frame <= ALTREF_FRAME; ++ref_frame) {
       const YV12_BUFFER_CONFIG *yv12 = get_ref_frame_buffer(cpi, ref_frame);
-      if (yv12 != NULL && (cpi->ref_frame_flags & flag_list[ref_frame])) {
+      if (yv12 != NULL &&
+          (cpi->ref_frame_flags & ref_frame_to_flag(ref_frame))) {
         const struct scale_factors *const scale_fac =
             &cm->frame_refs[ref_frame - 1].sf;
         if (vp9_is_scaled(scale_fac)) {
-          cpi->ref_frame_flags &= (~flag_list[ref_frame]);
+          cpi->ref_frame_flags &= (~ref_frame_to_flag(ref_frame));
           // Point golden/altref frame buffer index to last.
           if (!svc->simulcast_mode) {
             if (ref_frame == GOLDEN_FRAME)
@@ -1243,6 +1234,7 @@
 
 void vp9_svc_update_ref_frame_buffer_idx(VP9_COMP *const cpi) {
   SVC *const svc = &cpi->svc;
+  int i = 0;
   // Update the usage of frame buffer index for base spatial layers.
   if (svc->spatial_layer_id == 0) {
     if ((cpi->ref_frame_flags & VP9_LAST_FLAG) || cpi->refresh_last_frame)
@@ -1251,6 +1243,11 @@
       svc->fb_idx_base[cpi->gld_fb_idx] = 1;
     if ((cpi->ref_frame_flags & VP9_ALT_FLAG) || cpi->refresh_alt_ref_frame)
       svc->fb_idx_base[cpi->alt_fb_idx] = 1;
+    // For bypass/flexible mode: check for refresh slots.
+    if (svc->temporal_layering_mode == VP9E_TEMPORAL_LAYERING_MODE_BYPASS) {
+      for (i = 0; i < REF_FRAMES; ++i)
+        if (svc->update_buffer_slot[0] & (1 << i)) svc->fb_idx_base[i] = 1;
+    }
   }
 }
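[Note: the three per-function flag_list tables removed in this file are folded into one shared ref_frame_to_flag() helper. Its body is not part of this patch; a hedged sketch of what it presumably does, mirroring the tables it replaces:

    // Assumption: the helper lives in an encoder header and indexes the
    // same table the deleted locals each duplicated.
    static INLINE int ref_frame_to_flag(int8_t ref_frame) {
      static const int kFlagList[4] = { 0, VP9_LAST_FLAG, VP9_GOLD_FLAG,
                                        VP9_ALT_FLAG };
      assert(ref_frame >= LAST_FRAME && ref_frame <= ALTREF_FRAME);
      return kFlagList[ref_frame];
    }

One table in one place also means a single spot to update if the reference-frame set ever changes.]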
 
diff --git a/libvpx/vp9/encoder/x86/temporal_filter_sse4.c b/libvpx/vp9/encoder/x86/temporal_filter_sse4.c
index 437f49f..87e68fb 100644
--- a/libvpx/vp9/encoder/x86/temporal_filter_sse4.c
+++ b/libvpx/vp9/encoder/x86/temporal_filter_sse4.c
@@ -270,13 +270,11 @@
 // size 4 for the weights for each of the 4 subblocks if blk_fw is not NULL,
 // else use top_weight for top half, and bottom weight for bottom half.
 static void vp9_apply_temporal_filter_luma_16(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, int use_whole_blk, uint32_t *y_accum,
-    uint16_t *y_count, const uint16_t *y_dist, const uint16_t *u_dist,
-    const uint16_t *v_dist, const int16_t *const *neighbors_first,
+    const uint8_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
+    const int16_t *const *neighbors_first,
     const int16_t *const *neighbors_second, int top_weight, int bottom_weight,
     const int *blk_fw) {
   const int rounding = (1 << strength) >> 1;
@@ -301,7 +299,6 @@
   assert(strength <= 6);
 
   assert(block_width == 16);
-
   (void)block_width;
 
   // Initialize the weights
@@ -342,17 +339,12 @@
   accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
                           y_accum);
 
-  y_src += y_src_stride;
   y_pre += y_pre_stride;
   y_count += y_pre_stride;
   y_accum += y_pre_stride;
   y_dist += DIST_STRIDE;
 
-  u_src += uv_src_stride;
-  u_pre += uv_pre_stride;
   u_dist += DIST_STRIDE;
-  v_src += uv_src_stride;
-  v_pre += uv_pre_stride;
   v_dist += DIST_STRIDE;
 
   // Then all the rows except the last one
@@ -392,11 +384,7 @@
       read_chroma_dist_row_16(ss_x, u_dist, v_dist, &u_first, &u_second,
                               &v_first, &v_second);
 
-      u_src += uv_src_stride;
-      u_pre += uv_pre_stride;
       u_dist += DIST_STRIDE;
-      v_src += uv_src_stride;
-      v_pre += uv_pre_stride;
       v_dist += DIST_STRIDE;
     }
 
@@ -413,7 +401,6 @@
     accumulate_and_store_16(sum_row_first, sum_row_second, y_pre, y_count,
                             y_accum);
 
-    y_src += y_src_stride;
     y_pre += y_pre_stride;
     y_count += y_pre_stride;
     y_accum += y_pre_stride;
@@ -458,13 +445,10 @@
 
 // Perform temporal filter for the luma component.
 static void vp9_apply_temporal_filter_luma(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
-    uint32_t *y_accum, uint16_t *y_count, const uint16_t *y_dist,
-    const uint16_t *u_dist, const uint16_t *v_dist) {
+    const uint8_t *y_pre, int y_pre_stride, unsigned int block_width,
+    unsigned int block_height, int ss_x, int ss_y, int strength,
+    const int *blk_fw, int use_whole_blk, uint32_t *y_accum, uint16_t *y_count,
+    const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
   unsigned int blk_col = 0, uv_blk_col = 0;
   const unsigned int blk_col_step = 16, uv_blk_col_step = 16 >> ss_x;
   const unsigned int mid_width = block_width >> 1,
@@ -476,27 +460,22 @@
 
   if (block_width == 16) {
     // Special Case: The blockwidth is 16 and we are operating on a row of 16
-    // chroma pixels. In this case, we can't use the usualy left-midle-right
+    // chroma pixels. In this case, we can't use the usual left-middle-right
     // pattern. We also don't support splitting now.
     neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
     neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
     if (use_whole_blk) {
       vp9_apply_temporal_filter_luma_16(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16,
-          block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-          y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-          v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-          bottom_weight, NULL);
+          y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+          use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+          neighbors_second, top_weight, bottom_weight, NULL);
     } else {
       vp9_apply_temporal_filter_luma_16(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16,
-          block_height, ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-          y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-          v_dist + uv_blk_col, neighbors_first, neighbors_second, 0, 0, blk_fw);
+          y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+          use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+          neighbors_second, 0, 0, blk_fw);
     }
 
     return;
@@ -506,9 +485,7 @@
   neighbors_first = LUMA_LEFT_COLUMN_NEIGHBORS;
   neighbors_second = LUMA_MIDDLE_COLUMN_NEIGHBORS;
   vp9_apply_temporal_filter_luma_16(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength,
+      y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
       use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
       u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
       neighbors_second, top_weight, bottom_weight, NULL);
@@ -521,13 +498,10 @@
   for (; blk_col < mid_width;
        blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
     vp9_apply_temporal_filter_luma_16(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height,
-        ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-        bottom_weight, NULL);
+        y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+        use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+        neighbors_second, top_weight, bottom_weight, NULL);
   }
 
   if (!use_whole_blk) {
@@ -539,21 +513,16 @@
   for (; blk_col < last_width;
        blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
     vp9_apply_temporal_filter_luma_16(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, 16, block_height,
-        ss_x, ss_y, strength, use_whole_blk, y_accum + blk_col,
-        y_count + blk_col, y_dist + blk_col, u_dist + uv_blk_col,
-        v_dist + uv_blk_col, neighbors_first, neighbors_second, top_weight,
-        bottom_weight, NULL);
+        y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
+        use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
+        neighbors_second, top_weight, bottom_weight, NULL);
   }
 
   // Right
   neighbors_second = LUMA_RIGHT_COLUMN_NEIGHBORS;
   vp9_apply_temporal_filter_luma_16(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, 16, block_height, ss_x, ss_y, strength,
+      y_pre + blk_col, y_pre_stride, 16, block_height, ss_x, ss_y, strength,
       use_whole_blk, y_accum + blk_col, y_count + blk_col, y_dist + blk_col,
       u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors_first,
       neighbors_second, top_weight, bottom_weight, NULL);
@@ -564,10 +533,7 @@
 // blk_fw as an array of size 4 for the weights for each of the 4 subblocks,
 // else use top_weight for top half, and bottom weight for bottom half.
 static void vp9_apply_temporal_filter_chroma_8(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int uv_block_width,
+    const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride,
     unsigned int uv_block_height, int ss_x, int ss_y, int strength,
     uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
     const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist,
@@ -587,9 +553,7 @@
   // Loop variable
   unsigned int h;
 
-  (void)uv_block_width;
-
-  // Initilize weight
+  // Initialize weight
   if (blk_fw) {
     weight = _mm_setr_epi16(blk_fw[0], blk_fw[0], blk_fw[0], blk_fw[0],
                             blk_fw[1], blk_fw[1], blk_fw[1], blk_fw[1]);
@@ -621,10 +585,8 @@
   accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
   accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
 
-  u_src += uv_src_stride;
   u_pre += uv_pre_stride;
   u_dist += DIST_STRIDE;
-  v_src += uv_src_stride;
   v_pre += uv_pre_stride;
   v_dist += DIST_STRIDE;
   u_count += uv_pre_stride;
@@ -632,8 +594,6 @@
   v_count += uv_pre_stride;
   v_accum += uv_pre_stride;
 
-  y_src += y_src_stride * (1 + ss_y);
-  y_pre += y_pre_stride * (1 + ss_y);
   y_dist += DIST_STRIDE * (1 + ss_y);
 
   // Then all the rows except the last one
@@ -676,10 +636,8 @@
     accumulate_and_store_8(u_sum_row, u_pre, u_count, u_accum);
     accumulate_and_store_8(v_sum_row, v_pre, v_count, v_accum);
 
-    u_src += uv_src_stride;
     u_pre += uv_pre_stride;
     u_dist += DIST_STRIDE;
-    v_src += uv_src_stride;
     v_pre += uv_pre_stride;
     v_dist += DIST_STRIDE;
     u_count += uv_pre_stride;
@@ -687,8 +645,6 @@
     v_count += uv_pre_stride;
     v_accum += uv_pre_stride;
 
-    y_src += y_src_stride * (1 + ss_y);
-    y_pre += y_pre_stride * (1 + ss_y);
     y_dist += DIST_STRIDE * (1 + ss_y);
   }
 
@@ -719,12 +675,10 @@
 
 // Perform temporal filter for the chroma components.
 static void vp9_apply_temporal_filter_chroma(
-    const uint8_t *y_src, int y_src_stride, const uint8_t *y_pre,
-    int y_pre_stride, const uint8_t *u_src, const uint8_t *v_src,
-    int uv_src_stride, const uint8_t *u_pre, const uint8_t *v_pre,
-    int uv_pre_stride, unsigned int block_width, unsigned int block_height,
-    int ss_x, int ss_y, int strength, const int *blk_fw, int use_whole_blk,
-    uint32_t *u_accum, uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
+    const uint8_t *u_pre, const uint8_t *v_pre, int uv_pre_stride,
+    unsigned int block_width, unsigned int block_height, int ss_x, int ss_y,
+    int strength, const int *blk_fw, int use_whole_blk, uint32_t *u_accum,
+    uint16_t *u_count, uint32_t *v_accum, uint16_t *v_count,
     const uint16_t *y_dist, const uint16_t *u_dist, const uint16_t *v_dist) {
   const unsigned int uv_width = block_width >> ss_x,
                      uv_height = block_height >> ss_y;
@@ -751,22 +705,17 @@
 
     if (use_whole_blk) {
       vp9_apply_temporal_filter_chroma_8(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
-          top_weight, bottom_weight, NULL);
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height,
+          ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+          v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+          bottom_weight, NULL);
     } else {
       vp9_apply_temporal_filter_chroma_8(
-          y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-          u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-          uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-          u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-          y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
-          0, 0, blk_fw);
+          u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height,
+          ss_x, ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+          v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+          u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, 0, 0, blk_fw);
     }
 
     return;
@@ -782,10 +731,8 @@
   }
 
   vp9_apply_temporal_filter_chroma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
-      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+      u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+      ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
       v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
       u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
       bottom_weight, NULL);
@@ -805,13 +752,11 @@
   for (; uv_blk_col < uv_mid_width;
        blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
     vp9_apply_temporal_filter_chroma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
-        top_weight, bottom_weight, NULL);
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+        ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+        v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+        bottom_weight, NULL);
   }
 
   if (!use_whole_blk) {
@@ -823,13 +768,11 @@
   for (; uv_blk_col < uv_last_width;
        blk_col += blk_col_step, uv_blk_col += uv_blk_col_step) {
     vp9_apply_temporal_filter_chroma_8(
-        y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-        u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride,
-        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_width,
-        uv_height, ss_x, ss_y, strength, u_accum + uv_blk_col,
-        u_count + uv_blk_col, v_accum + uv_blk_col, v_count + uv_blk_col,
-        y_dist + blk_col, u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors,
-        top_weight, bottom_weight, NULL);
+        u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+        ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+        v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
+        u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
+        bottom_weight, NULL);
   }
 
   // Right
@@ -842,10 +785,8 @@
   }
 
   vp9_apply_temporal_filter_chroma_8(
-      y_src + blk_col, y_src_stride, y_pre + blk_col, y_pre_stride,
-      u_src + uv_blk_col, v_src + uv_blk_col, uv_src_stride, u_pre + uv_blk_col,
-      v_pre + uv_blk_col, uv_pre_stride, uv_width, uv_height, ss_x, ss_y,
-      strength, u_accum + uv_blk_col, u_count + uv_blk_col,
+      u_pre + uv_blk_col, v_pre + uv_blk_col, uv_pre_stride, uv_height, ss_x,
+      ss_y, strength, u_accum + uv_blk_col, u_count + uv_blk_col,
       v_accum + uv_blk_col, v_count + uv_blk_col, y_dist + blk_col,
       u_dist + uv_blk_col, v_dist + uv_blk_col, neighbors, top_weight,
       bottom_weight, NULL);
@@ -886,12 +827,12 @@
   assert(
       (use_whole_blk || (blk_fw[1] >= 0 && blk_fw[2] >= 0 && blk_fw[3] >= 0)) &&
       "subblock filter weight must be positive");
-  assert(blk_fw[0] <= 2 && "sublock filter weight must be less than 2");
+  assert(blk_fw[0] <= 2 && "subblock filter weight must be less than 2");
   assert(
       (use_whole_blk || (blk_fw[1] <= 2 && blk_fw[2] <= 2 && blk_fw[3] <= 2)) &&
       "subblock filter weight must be less than 2");
 
-  // Precompute the difference sqaured
+  // Precompute the difference squared
   for (row = 0; row < block_height; row++) {
     for (blk_col = 0; blk_col < block_width; blk_col += 16) {
       store_dist_16(y_src_ptr + blk_col, y_pre_ptr + blk_col,
@@ -922,14 +863,12 @@
   u_dist_ptr = u_dist + 1;
   v_dist_ptr = v_dist + 1;
 
-  vp9_apply_temporal_filter_luma(
-      y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
-      u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
-      strength, blk_fw_ptr, use_whole_blk, y_accum, y_count, y_dist_ptr,
-      u_dist_ptr, v_dist_ptr);
+  vp9_apply_temporal_filter_luma(y_pre, y_pre_stride, block_width, block_height,
+                                 ss_x, ss_y, strength, blk_fw_ptr,
+                                 use_whole_blk, y_accum, y_count, y_dist_ptr,
+                                 u_dist_ptr, v_dist_ptr);
 
   vp9_apply_temporal_filter_chroma(
-      y_src, y_src_stride, y_pre, y_pre_stride, u_src, v_src, uv_src_stride,
       u_pre, v_pre, uv_pre_stride, block_width, block_height, ss_x, ss_y,
       strength, blk_fw_ptr, use_whole_blk, u_accum, u_count, v_accum, v_count,
       y_dist_ptr, u_dist_ptr, v_dist_ptr);
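[Note: the signature pruning throughout this file is possible because the squared pixel differences are computed once up front (the store_dist_16 loop above) into y_dist/u_dist/v_dist; after that, the luma and chroma kernels only read the pre planes and the distance rows, so the y_src/u_src/v_src pointers and their strides were dead weight. A self-contained sketch of the precompute step that makes the source planes unnecessary downstream (helper name is illustrative):

    #include <stdint.h>

    // Once dist[] holds (src - pre)^2 per pixel, a filter pass needs only
    // pre[] and dist[]; src[] never has to be revisited.
    static void precompute_dist(const uint8_t *src, const uint8_t *pre,
                                uint16_t *dist, int n) {
      for (int i = 0; i < n; ++i) {
        const int d = src[i] - pre[i];
        dist[i] = (uint16_t)(d * d);
      }
    }
]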
diff --git a/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c b/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
index 4be6a5e..fcf50eb 100644
--- a/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
+++ b/libvpx/vp9/encoder/x86/vp9_diamond_search_sad_avx.c
@@ -282,7 +282,14 @@
 
         // Update the global minimum if the local minimum is smaller
         if (LIKELY(local_best_sad < best_sad)) {
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wmaybe-uninitialized"
+#endif
           new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
+#if defined(__GNUC__) && __GNUC__ >= 4 && !defined(__clang__)
+#pragma GCC diagnostic pop
+#endif
           new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];
 
           best_sad = local_best_sad;
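[Note: the pragmas added here silence what appears to be a false positive. v_these_mv_w is a SIMD register variable that GCC's flow analysis cannot see through once it is reinterpreted via the int_mv pointer cast, so -Wmaybe-uninitialized fires even though every lane has been written. The !defined(__clang__) guard matters because clang also defines __GNUC__ but does not implement this particular warning the same way, and the push/pop pair keeps the suppression scoped to the single offending statement instead of the rest of the translation unit.]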
diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c b/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c
index 8dfdbd5..db18b1a 100644
--- a/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c
+++ b/libvpx/vp9/encoder/x86/vp9_quantize_avx2.c
@@ -47,18 +47,15 @@
 }
 
 void vp9_quantize_fp_avx2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                          int skip_block, const int16_t *round_ptr,
-                          const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                          uint16_t *eob_ptr, const int16_t *scan,
-                          const int16_t *iscan) {
+                          const int16_t *round_ptr, const int16_t *quant_ptr,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan, const int16_t *iscan) {
   __m128i eob;
   __m256i round256, quant256, dequant256;
   __m256i eob256, thr256;
 
   (void)scan;
-  (void)skip_block;
-  assert(!skip_block);
 
   coeff_ptr += n_coeffs;
   iscan += n_coeffs;
diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c b/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
index e3d803b..4bcadaa 100644
--- a/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
+++ b/libvpx/vp9/encoder/x86/vp9_quantize_sse2.c
@@ -18,11 +18,10 @@
 #include "vpx_dsp/x86/bitdepth_conversion_sse2.h"
 
 void vp9_quantize_fp_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                          int skip_block, const int16_t *round_ptr,
-                          const int16_t *quant_ptr, tran_low_t *qcoeff_ptr,
-                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                          uint16_t *eob_ptr, const int16_t *scan,
-                          const int16_t *iscan) {
+                          const int16_t *round_ptr, const int16_t *quant_ptr,
+                          tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                          const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                          const int16_t *scan, const int16_t *iscan) {
   __m128i zero;
   __m128i thr;
   int nzflag;
@@ -30,8 +29,6 @@
   __m128i round, quant, dequant;
 
   (void)scan;
-  (void)skip_block;
-  assert(!skip_block);
 
   coeff_ptr += n_coeffs;
   iscan += n_coeffs;
diff --git a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
index 5703aa3..680acfe 100644
--- a/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
+++ b/libvpx/vp9/encoder/x86/vp9_quantize_ssse3_x86_64.asm
@@ -19,18 +19,18 @@
 SECTION .text
 
 %macro QUANTIZE_FP 2
-cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, skip, round, quant, \
+cglobal quantize_%1, 0, %2, 15, coeff, ncoeff, round, quant, \
                                 qcoeff, dqcoeff, dequant, \
                                 eob, scan, iscan
 
   ; actual quantize loop - setup pointers, rounders, etc.
   movifnidn                   coeffq, coeffmp
   movifnidn                  ncoeffq, ncoeffmp
-  mov                             r2, dequantmp
   movifnidn                   roundq, roundmp
   movifnidn                   quantq, quantmp
   mova                            m1, [roundq]             ; m1 = round
   mova                            m2, [quantq]             ; m2 = quant
+  mov                             r2, dequantmp
 %ifidn %1, fp_32x32
   pcmpeqw                         m5, m5
   psrlw                           m5, 15
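[Note: the mov of dequantmp is deferred rather than dropped. With the skip argument removed, the third parameter (round) presumably now arrives in the register x86inc maps to r2 on 64-bit calling conventions, so loading dequantmp into r2 up front would clobber it before the mova reads; deferring the load until round and quant have been copied into m1/m2 frees r2 for reuse.]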
diff --git a/libvpx/vp9/ratectrl_rtc.cc b/libvpx/vp9/ratectrl_rtc.cc
index 6446120..f4d7f7e 100644
--- a/libvpx/vp9/ratectrl_rtc.cc
+++ b/libvpx/vp9/ratectrl_rtc.cc
@@ -25,7 +25,10 @@
                                                 VP9RateControlRTC());
   if (!rc_api) return nullptr;
   rc_api->cpi_ = static_cast<VP9_COMP *>(vpx_memalign(32, sizeof(*cpi_)));
-  if (!rc_api->cpi_) return nullptr;
+  if (!rc_api->cpi_) {
+    rc_api.reset();
+    return nullptr;
+  }
   vp9_zero(*rc_api->cpi_);
 
   rc_api->InitRateControl(cfg);
@@ -34,6 +37,10 @@
     cpi->segmentation_map = static_cast<uint8_t *>(
         vpx_calloc(cpi->common.mi_rows * cpi->common.mi_cols,
                    sizeof(*cpi->segmentation_map)));
+    if (!cpi->segmentation_map) {
+      rc_api.reset();
+      return nullptr;
+    }
     cpi->cyclic_refresh =
         vp9_cyclic_refresh_alloc(cpi->common.mi_rows, cpi->common.mi_cols);
     cpi->cyclic_refresh->content_mode = 0;
@@ -105,7 +112,7 @@
   cpi_->framerate = rc_cfg.framerate;
   cpi_->svc.number_spatial_layers = rc_cfg.ss_number_layers;
   cpi_->svc.number_temporal_layers = rc_cfg.ts_number_layers;
-
+  vp9_set_mb_mi(cm, cm->width, cm->height);
   for (int sl = 0; sl < cpi_->svc.number_spatial_layers; ++sl) {
     for (int tl = 0; tl < cpi_->svc.number_temporal_layers; ++tl) {
       const int layer =
diff --git a/libvpx/vp9/ratectrl_rtc.h b/libvpx/vp9/ratectrl_rtc.h
index 5cc7ec9..d2b9417 100644
--- a/libvpx/vp9/ratectrl_rtc.h
+++ b/libvpx/vp9/ratectrl_rtc.h
@@ -22,28 +22,14 @@
 #include "vp9/encoder/vp9_encoder.h"
 #include "vp9/encoder/vp9_firstpass.h"
 #include "vp9/vp9_cx_iface.h"
+#include "vpx/internal/vpx_ratectrl_rtc.h"
 #include "vpx_mem/vpx_mem.h"
 
 namespace libvpx {
 
-struct VP9RateControlRtcConfig {
+struct VP9RateControlRtcConfig : public VpxRateControlRtcConfig {
  public:
   VP9RateControlRtcConfig() {
-    width = 1280;
-    height = 720;
-    max_quantizer = 63;
-    min_quantizer = 2;
-    target_bandwidth = 1000;
-    buf_initial_sz = 600;
-    buf_optimal_sz = 600;
-    buf_sz = 1000;
-    undershoot_pct = overshoot_pct = 50;
-    max_intra_bitrate_pct = 50;
-    max_inter_bitrate_pct = 0;
-    framerate = 30.0;
-    ss_number_layers = ts_number_layers = 1;
-    rc_mode = VPX_CBR;
-    aq_mode = 0;
     vp9_zero(max_quantizers);
     vp9_zero(min_quantizers);
     vp9_zero(scaling_factor_den);
@@ -52,26 +38,10 @@
     vp9_zero(ts_rate_decimator);
     scaling_factor_num[0] = 1;
     scaling_factor_den[0] = 1;
-    layer_target_bitrate[0] = static_cast<int>(target_bandwidth);
     max_quantizers[0] = max_quantizer;
     min_quantizers[0] = min_quantizer;
-    ts_rate_decimator[0] = 1;
   }
 
-  int width;
-  int height;
-  // 0-63
-  int max_quantizer;
-  int min_quantizer;
-  int64_t target_bandwidth;
-  int64_t buf_initial_sz;
-  int64_t buf_optimal_sz;
-  int64_t buf_sz;
-  int undershoot_pct;
-  int overshoot_pct;
-  int max_intra_bitrate_pct;
-  int max_inter_bitrate_pct;
-  double framerate;
   // Number of spatial layers
   int ss_number_layers;
   // Number of temporal layers
@@ -80,11 +50,6 @@
   int min_quantizers[VPX_MAX_LAYERS];
   int scaling_factor_num[VPX_SS_MAX_LAYERS];
   int scaling_factor_den[VPX_SS_MAX_LAYERS];
-  int layer_target_bitrate[VPX_MAX_LAYERS];
-  int ts_rate_decimator[VPX_TS_MAX_LAYERS];
-  // vbr, cbr
-  enum vpx_rc_mode rc_mode;
-  int aq_mode;
 };
 
 struct VP9FrameParamsQpRTC {
@@ -94,7 +59,7 @@
 };
 
 // This interface allows using VP9 real-time rate control without initializing
-// the encoder. To use this interface, you need to link with libvp9rc.a.
+// the encoder. To use this interface, you need to link with libvpxrc.a.
 //
 // #include "vp9/ratectrl_rtc.h"
 // VP9RateControlRTC rc_api;
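[Note: expanding the truncated usage comment above into a fuller sketch; the factory returning a smart pointer matches the Create() changes in ratectrl_rtc.cc earlier in this patch, but the method names past Create() are assumptions about this interface, not shown here:

    #include <memory>

    libvpx::VP9RateControlRtcConfig cfg;  // base-struct defaults (1280x720,
    cfg.target_bandwidth = 800;           // CBR, 30 fps); override as needed
    std::unique_ptr<libvpx::VP9RateControlRTC> rc =
        libvpx::VP9RateControlRTC::Create(cfg);
    if (rc) {
      libvpx::VP9FrameParamsQpRTC frame_params = {};  // per-frame inputs
      rc->ComputeQP(frame_params);  // run the rate controller
      const int qp = rc->GetQP();   // QP to hand to the external encoder
      rc->PostEncodeUpdate(1000);   // feed back the actual encoded size
    }
]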
diff --git a/libvpx/vp9/simple_encode.cc b/libvpx/vp9/simple_encode.cc
index 6ba37a3..654699e 100644
--- a/libvpx/vp9/simple_encode.cc
+++ b/libvpx/vp9/simple_encode.cc
@@ -110,6 +110,7 @@
                               vpx_img_fmt_t img_fmt) {
   VP9_COMP *cpi;
   BufferPool *buffer_pool = (BufferPool *)vpx_calloc(1, sizeof(*buffer_pool));
+  if (!buffer_pool) return NULL;
   vp9_initialize_enc();
   cpi = vp9_create_compressor(oxcf, buffer_pool);
   vp9_update_compressor_with_img_fmt(cpi, img_fmt);
@@ -782,11 +783,12 @@
 
 static VP9EncoderConfig GetEncodeConfig(
     int frame_width, int frame_height, vpx_rational_t frame_rate,
-    int target_bitrate, int encode_speed, vpx_enc_pass enc_pass,
+    int target_bitrate, int encode_speed, int target_level,
+    vpx_enc_pass enc_pass,
     const std::vector<EncodeConfig> &encode_config_list) {
-  VP9EncoderConfig oxcf =
-      vp9_get_encoder_config(frame_width, frame_height, frame_rate,
-                             target_bitrate, encode_speed, enc_pass);
+  VP9EncoderConfig oxcf = vp9_get_encoder_config(
+      frame_width, frame_height, frame_rate, target_bitrate, encode_speed,
+      target_level, enc_pass);
   for (const auto &config : encode_config_list) {
     UpdateEncodeConfig(config, &oxcf);
   }
@@ -799,7 +801,7 @@
 
 SimpleEncode::SimpleEncode(int frame_width, int frame_height,
                            int frame_rate_num, int frame_rate_den,
-                           int target_bitrate, int num_frames,
+                           int target_bitrate, int num_frames, int target_level,
                            const char *infile_path, const char *outfile_path) {
   impl_ptr_ = std::unique_ptr<EncodeImpl>(new EncodeImpl());
   frame_width_ = frame_width;
@@ -809,6 +811,7 @@
   target_bitrate_ = target_bitrate;
   num_frames_ = num_frames;
   encode_speed_ = 0;
+  target_level_ = target_level;
 
   frame_coding_index_ = 0;
   show_frame_count_ = 0;
@@ -860,9 +863,9 @@
   }
   const vpx_rational_t frame_rate =
       make_vpx_rational(frame_rate_num_, frame_rate_den_);
-  const VP9EncoderConfig oxcf =
-      GetEncodeConfig(frame_width_, frame_height_, frame_rate, target_bitrate_,
-                      encode_speed_, enc_pass, impl_ptr_->encode_config_list);
+  const VP9EncoderConfig oxcf = GetEncodeConfig(
+      frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_,
+      target_level_, enc_pass, impl_ptr_->encode_config_list);
   vp9_dump_encoder_config(&oxcf, fp);
   return StatusOk;
 }
@@ -872,7 +875,7 @@
       make_vpx_rational(frame_rate_num_, frame_rate_den_);
   const VP9EncoderConfig oxcf = GetEncodeConfig(
       frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_,
-      VPX_RC_FIRST_PASS, impl_ptr_->encode_config_list);
+      target_level_, VPX_RC_FIRST_PASS, impl_ptr_->encode_config_list);
   impl_ptr_->cpi = init_encoder(&oxcf, impl_ptr_->img_fmt);
   struct lookahead_ctx *lookahead = impl_ptr_->cpi->lookahead;
   int i;
@@ -1038,7 +1041,7 @@
       make_vpx_rational(frame_rate_num_, frame_rate_den_);
   VP9EncoderConfig oxcf = GetEncodeConfig(
       frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_,
-      VPX_RC_LAST_PASS, impl_ptr_->encode_config_list);
+      target_level_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list);
 
   vpx_fixed_buf_t stats;
   stats.buf = GetVectorData(impl_ptr_->first_pass_stats);
@@ -1266,7 +1269,7 @@
       make_vpx_rational(frame_rate_num_, frame_rate_den_);
   const VP9EncoderConfig oxcf = GetEncodeConfig(
       frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_,
-      VPX_RC_LAST_PASS, impl_ptr_->encode_config_list);
+      target_level_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list);
   FRAME_INFO frame_info = vp9_get_frame_info(&oxcf);
   fps_init_first_pass_info(&twopass.first_pass_info,
                            GetVectorData(impl_ptr_->first_pass_stats),
@@ -1285,7 +1288,7 @@
       make_vpx_rational(frame_rate_num_, frame_rate_den_);
   const VP9EncoderConfig oxcf = GetEncodeConfig(
       frame_width_, frame_height_, frame_rate, target_bitrate_, encode_speed_,
-      VPX_RC_LAST_PASS, impl_ptr_->encode_config_list);
+      target_level_, VPX_RC_LAST_PASS, impl_ptr_->encode_config_list);
   TWO_PASS twopass;
   fps_init_first_pass_info(&twopass.first_pass_info,
                            GetVectorData(impl_ptr_->first_pass_stats),
diff --git a/libvpx/vp9/simple_encode.h b/libvpx/vp9/simple_encode.h
index 8ec7069..7920e95 100644
--- a/libvpx/vp9/simple_encode.h
+++ b/libvpx/vp9/simple_encode.h
@@ -44,6 +44,26 @@
   kRefFrameTypeNone = -1,
 };
 
+enum VP9_LEVEL {
+  LEVEL_UNKNOWN = 0,
+  LEVEL_AUTO = 1,
+  LEVEL_1 = 10,
+  LEVEL_1_1 = 11,
+  LEVEL_2 = 20,
+  LEVEL_2_1 = 21,
+  LEVEL_3 = 30,
+  LEVEL_3_1 = 31,
+  LEVEL_4 = 40,
+  LEVEL_4_1 = 41,
+  LEVEL_5 = 50,
+  LEVEL_5_1 = 51,
+  LEVEL_5_2 = 52,
+  LEVEL_6 = 60,
+  LEVEL_6_1 = 61,
+  LEVEL_6_2 = 62,
+  LEVEL_MAX = 255
+};
+
 enum GopMapFlag {
   kGopMapFlagStart =
       1 << 0,  // Indicate this location is the start of a group of pictures.
@@ -343,7 +363,8 @@
   // format.
   SimpleEncode(int frame_width, int frame_height, int frame_rate_num,
                int frame_rate_den, int target_bitrate, int num_frames,
-               const char *infile_path, const char *outfile_path = nullptr);
+               int target_level, const char *infile_path,
+               const char *outfile_path = nullptr);
   ~SimpleEncode();
   SimpleEncode(SimpleEncode &) = delete;
   SimpleEncode &operator=(const SimpleEncode &) = delete;
@@ -513,6 +534,7 @@
   int target_bitrate_;
   int num_frames_;
   int encode_speed_;
+  int target_level_;
 
   std::FILE *in_file_;
   std::FILE *out_file_;
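[Note: with the new parameter threaded through, constructing SimpleEncode looks like the sketch below; all values are illustrative, and target_level takes the VP9_LEVEL values defined above (LEVEL_AUTO lets the encoder pick):

    // Hedged usage sketch of the updated constructor.
    SimpleEncode encoder(1280, 720,   // frame_width, frame_height
                         30, 1,       // frame_rate_num / frame_rate_den
                         800,         // target_bitrate
                         150,         // num_frames
                         LEVEL_AUTO,  // new target_level argument
                         "input.yuv", "output.bin");
]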
diff --git a/libvpx/vp9/vp9_cx_iface.c b/libvpx/vp9/vp9_cx_iface.c
index 48d5555..05ac9e1 100644
--- a/libvpx/vp9/vp9_cx_iface.c
+++ b/libvpx/vp9/vp9_cx_iface.c
@@ -15,7 +15,6 @@
 #include "vpx/vpx_encoder.h"
 #include "vpx/vpx_ext_ratectrl.h"
 #include "vpx_dsp/psnr.h"
-#include "vpx_ports/vpx_once.h"
 #include "vpx_ports/static_assert.h"
 #include "vpx_ports/system_state.h"
 #include "vpx_util/vpx_timestamp.h"
@@ -66,7 +65,11 @@
 } vp9_extracfg;
 
 static struct vp9_extracfg default_extra_cfg = {
-  0,                     // cpu_used
+#if CONFIG_REALTIME_ONLY
+  5,  // cpu_used
+#else
+  0,  // cpu_used
+#endif
   1,                     // enable_auto_alt_ref
   0,                     // noise_sensitivity
   0,                     // sharpness
@@ -381,8 +384,8 @@
     case VPX_IMG_FMT_I440:
       if (ctx->cfg.g_profile != (unsigned int)PROFILE_1) {
         ERROR(
-            "Invalid image format. I422, I444, I440, NV12 images are "
-            "not supported in profile.");
+            "Invalid image format. I422, I444, I440 images are not supported "
+            "in profile.");
       }
       break;
     case VPX_IMG_FMT_I42216:
@@ -397,8 +400,8 @@
       break;
     default:
       ERROR(
-          "Invalid image format. Only YV12, I420, I422, I444 images are "
-          "supported.");
+          "Invalid image format. Only YV12, I420, I422, I444, I440, NV12 "
+          "images are supported.");
       break;
   }
 
@@ -523,8 +526,9 @@
   raw_target_rate =
       (unsigned int)((int64_t)oxcf->width * oxcf->height * oxcf->bit_depth * 3 *
                      oxcf->init_framerate / 1000);
-  // Cap target bitrate to raw rate
-  cfg->rc_target_bitrate = VPXMIN(raw_target_rate, cfg->rc_target_bitrate);
+  // Cap target bitrate to raw rate or 1000Mbps, whichever is less
+  cfg->rc_target_bitrate =
+      VPXMIN(VPXMIN(raw_target_rate, cfg->rc_target_bitrate), 1000000);
 
   // Convert target bandwidth from Kbit/s to Bit/s
   oxcf->target_bandwidth = 1000 * (int64_t)cfg->rc_target_bitrate;
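[Note, for scale: the raw rate for 8-bit 1080p at 30 fps is 1920 * 1080 * 8 * 3 * 30 / 1000, roughly 1,492,992 kbps, so at larger frame sizes it is the new fixed ceiling of 1,000,000 kbps (1000 Mbps) that actually binds, keeping rc_target_bitrate bounded regardless of resolution.]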
@@ -780,7 +784,7 @@
 static vpx_codec_err_t encoder_set_config(vpx_codec_alg_priv_t *ctx,
                                           const vpx_codec_enc_cfg_t *cfg) {
   vpx_codec_err_t res;
-  int force_key = 0;
+  volatile int force_key = 0;
 
   if (cfg->g_w != ctx->cfg.g_w || cfg->g_h != ctx->cfg.g_h) {
     if (cfg->g_lag_in_frames > 1 || cfg->g_pass != VPX_RC_ONE_PASS)
@@ -799,19 +803,28 @@
     ERROR("Cannot increase lag_in_frames");
 
   res = validate_config(ctx, cfg, &ctx->extra_cfg);
+  if (res != VPX_CODEC_OK) return res;
 
-  if (res == VPX_CODEC_OK) {
-    ctx->cfg = *cfg;
-    set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
-    set_twopass_params_from_config(&ctx->cfg, ctx->cpi);
-    // On profile change, request a key frame
-    force_key |= ctx->cpi->common.profile != ctx->oxcf.profile;
-    vp9_change_config(ctx->cpi, &ctx->oxcf);
+  if (setjmp(ctx->cpi->common.error.jmp)) {
+    const vpx_codec_err_t codec_err =
+        update_error_state(ctx, &ctx->cpi->common.error);
+    ctx->cpi->common.error.setjmp = 0;
+    vpx_clear_system_state();
+    assert(codec_err != VPX_CODEC_OK);
+    return codec_err;
   }
 
+  ctx->cfg = *cfg;
+  set_encoder_config(&ctx->oxcf, &ctx->cfg, &ctx->extra_cfg);
+  set_twopass_params_from_config(&ctx->cfg, ctx->cpi);
+  // On profile change, request a key frame
+  force_key |= ctx->cpi->common.profile != ctx->oxcf.profile;
+  vp9_change_config(ctx->cpi, &ctx->oxcf);
+
   if (force_key) ctx->next_frame_flags |= VPX_EFLAG_FORCE_KF;
 
-  return res;
+  ctx->cpi->common.error.setjmp = 0;
+  return VPX_CODEC_OK;
 }
 
 static vpx_codec_err_t ctrl_get_quantizer(vpx_codec_alg_priv_t *ctx,
@@ -1095,7 +1108,7 @@
     }
 
     priv->extra_cfg = default_extra_cfg;
-    once(vp9_initialize_enc);
+    vp9_initialize_enc();
 
     res = validate_config(priv, &priv->cfg, &priv->extra_cfg);
 
@@ -2143,6 +2156,7 @@
 VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height,
                                         vpx_rational_t frame_rate,
                                         int target_bitrate, int encode_speed,
+                                        int target_level,
                                         vpx_enc_pass enc_pass) {
   /* This function will generate the same VP9EncoderConfig used by the
    * vpxenc command given below.
@@ -2154,6 +2168,7 @@
    * FPS:     frame_rate
    * BITRATE: target_bitrate
    * CPU_USED:encode_speed
+   * TARGET_LEVEL: target_level
    *
    * INPUT, OUTPUT, LIMIT will not affect VP9EncoderConfig
    *
@@ -2166,6 +2181,7 @@
    * FPS=30/1
    * LIMIT=150
    * CPU_USED=0
+   * TARGET_LEVEL=0
    * ./vpxenc --limit=$LIMIT --width=$WIDTH --height=$HEIGHT --fps=$FPS
    * --lag-in-frames=25 \
    *  --codec=vp9 --good --cpu-used=CPU_USED --threads=0 --profile=0 \
@@ -2174,7 +2190,7 @@
    *  --minsection-pct=0 --maxsection-pct=150 --arnr-maxframes=7 --psnr \
    *  --arnr-strength=5 --sharpness=0 --undershoot-pct=100 --overshoot-pct=100 \
    *  --frame-parallel=0 --tile-columns=0 --cpu-used=0 --end-usage=vbr \
-   *  --target-bitrate=$BITRATE -o $OUTPUT $INPUT
+   *  --target-bitrate=$BITRATE --target-level=0 -o $OUTPUT $INPUT
    */
 
   VP9EncoderConfig oxcf;
@@ -2192,6 +2208,7 @@
   oxcf.frame_parallel_decoding_mode = 0;
   oxcf.two_pass_vbrmax_section = 150;
   oxcf.speed = abs(encode_speed);
+  oxcf.target_level = target_level;
   return oxcf;
 }
 
diff --git a/libvpx/vp9/vp9_cx_iface.h b/libvpx/vp9/vp9_cx_iface.h
index 01338ad..f2de850 100644
--- a/libvpx/vp9/vp9_cx_iface.h
+++ b/libvpx/vp9/vp9_cx_iface.h
@@ -20,6 +20,7 @@
 VP9EncoderConfig vp9_get_encoder_config(int frame_width, int frame_height,
                                         vpx_rational_t frame_rate,
                                         int target_bitrate, int encode_speed,
+                                        int target_level,
                                         vpx_enc_pass enc_pass);
 
 void vp9_dump_encoder_config(const VP9EncoderConfig *oxcf, FILE *fp);
diff --git a/libvpx/vp9/vp9_dx_iface.c b/libvpx/vp9/vp9_dx_iface.c
index 35ecbaf..3c42c7d 100644
--- a/libvpx/vp9/vp9_dx_iface.c
+++ b/libvpx/vp9/vp9_dx_iface.c
@@ -201,7 +201,7 @@
   return error->error_code;
 }
 
-static void init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
+static vpx_codec_err_t init_buffer_callbacks(vpx_codec_alg_priv_t *ctx) {
   VP9_COMMON *const cm = &ctx->pbi->common;
   BufferPool *const pool = cm->buffer_pool;
 
@@ -217,12 +217,16 @@
     pool->get_fb_cb = vp9_get_frame_buffer;
     pool->release_fb_cb = vp9_release_frame_buffer;
 
-    if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers))
+    if (vp9_alloc_internal_frame_buffers(&pool->int_frame_buffers)) {
       vpx_internal_error(&cm->error, VPX_CODEC_MEM_ERROR,
                          "Failed to initialize internal frame buffers");
+      return VPX_CODEC_MEM_ERROR;
+    }
 
     pool->cb_priv = &pool->int_frame_buffers;
   }
+
+  return VPX_CODEC_OK;
 }
 
 static void set_default_ppflags(vp8_postproc_cfg_t *cfg) {
@@ -278,9 +282,7 @@
   if (!ctx->postproc_cfg_set && (ctx->base.init_flags & VPX_CODEC_USE_POSTPROC))
     set_default_ppflags(&ctx->postproc_cfg);
 
-  init_buffer_callbacks(ctx);
-
-  return VPX_CODEC_OK;
+  return init_buffer_callbacks(ctx);
 }
 
 static INLINE void check_resync(vpx_codec_alg_priv_t *const ctx,
diff --git a/libvpx/vp9/vp9cx.mk b/libvpx/vp9/vp9cx.mk
index 38e9916..92a7fdd 100644
--- a/libvpx/vp9/vp9cx.mk
+++ b/libvpx/vp9/vp9cx.mk
@@ -127,6 +127,7 @@
 
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_dct_intrin_sse2.c
 VP9_CX_SRCS-$(HAVE_SSSE3) += encoder/x86/vp9_frame_scale_ssse3.c
+VP9_CX_SRCS-$(HAVE_NEON) += encoder/arm/neon/vp9_dct_neon.c
 
 ifeq ($(CONFIG_VP9_TEMPORAL_DENOISING),yes)
 VP9_CX_SRCS-$(HAVE_SSE2) += encoder/x86/vp9_denoiser_sse2.c
diff --git a/libvpx/vpx/internal/vpx_codec_internal.h b/libvpx/vpx/internal/vpx_codec_internal.h
index 961b0bf..670fe38 100644
--- a/libvpx/vpx/internal/vpx_codec_internal.h
+++ b/libvpx/vpx/internal/vpx_codec_internal.h
@@ -435,9 +435,21 @@
 #endif
 #endif
 
+// Tells the compiler to perform `printf` format string checking if the
+// compiler supports it; see the 'format' attribute in
+// <https://gcc.gnu.org/onlinedocs/gcc/Common-Function-Attributes.html>.
+#define LIBVPX_FORMAT_PRINTF(string_index, first_to_check)
+#if defined(__has_attribute)
+#if __has_attribute(format)
+#undef LIBVPX_FORMAT_PRINTF
+#define LIBVPX_FORMAT_PRINTF(string_index, first_to_check) \
+  __attribute__((__format__(__printf__, string_index, first_to_check)))
+#endif
+#endif
+
 void vpx_internal_error(struct vpx_internal_error_info *info,
-                        vpx_codec_err_t error, const char *fmt,
-                        ...) CLANG_ANALYZER_NORETURN;
+                        vpx_codec_err_t error, const char *fmt, ...)
+    LIBVPX_FORMAT_PRINTF(3, 4) CLANG_ANALYZER_NORETURN;
 
 #ifdef __cplusplus
 }  // extern "C"
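[Note: a small example of what the new attribute buys; my_error() is hypothetical. With LIBVPX_FORMAT_PRINTF(2, 3) the format string is parameter 2 and the variadic arguments start at parameter 3 (vpx_internal_error uses 3 and 4 because its format string is the third parameter):

    void my_error(int code, const char *fmt, ...) LIBVPX_FORMAT_PRINTF(2, 3);

    void demo(void) {
      // The %d vs. string mismatch below draws a -Wformat warning wherever
      // the attribute is supported; elsewhere the macro expands to nothing
      // and the call is accepted silently.
      my_error(1, "bad size %d", "oops");
    }
]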
diff --git a/libvpx/vpx/internal/vpx_ratectrl_rtc.h b/libvpx/vpx/internal/vpx_ratectrl_rtc.h
new file mode 100644
index 0000000..65398c6
--- /dev/null
+++ b/libvpx/vpx/internal/vpx_ratectrl_rtc.h
@@ -0,0 +1,62 @@
+/*
+ *  Copyright (c) 2021 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_
+#define VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_
+
+#include "vpx/vpx_encoder.h"
+
+namespace libvpx {
+struct VpxRateControlRtcConfig {
+ public:
+  VpxRateControlRtcConfig() {
+    width = 1280;
+    height = 720;
+    max_quantizer = 63;
+    min_quantizer = 2;
+    target_bandwidth = 1000;
+    buf_initial_sz = 600;
+    buf_optimal_sz = 600;
+    buf_sz = 1000;
+    undershoot_pct = overshoot_pct = 50;
+    max_intra_bitrate_pct = 50;
+    max_inter_bitrate_pct = 0;
+    framerate = 30.0;
+    ts_number_layers = 1;
+    rc_mode = VPX_CBR;
+    aq_mode = 0;
+    layer_target_bitrate[0] = static_cast<int>(target_bandwidth);
+    ts_rate_decimator[0] = 1;
+  }
+
+  int width;
+  int height;
+  // 0-63
+  int max_quantizer;
+  int min_quantizer;
+  int64_t target_bandwidth;
+  int64_t buf_initial_sz;
+  int64_t buf_optimal_sz;
+  int64_t buf_sz;
+  int undershoot_pct;
+  int overshoot_pct;
+  int max_intra_bitrate_pct;
+  int max_inter_bitrate_pct;
+  double framerate;
+  // Number of temporal layers
+  int ts_number_layers;
+  int layer_target_bitrate[VPX_MAX_LAYERS];
+  int ts_rate_decimator[VPX_TS_MAX_LAYERS];
+  // vbr, cbr
+  enum vpx_rc_mode rc_mode;
+  int aq_mode;
+};
+}  // namespace libvpx
+#endif  // VPX_VPX_INTERNAL_VPX_RATECTRL_RTC_H_
diff --git a/libvpx/vpx/src/vpx_encoder.c b/libvpx/vpx/src/vpx_encoder.c
index f636b54..846638f 100644
--- a/libvpx/vpx/src/vpx_encoder.c
+++ b/libvpx/vpx/src/vpx_encoder.c
@@ -173,7 +173,7 @@
 #include "vpx_ports/x86.h"
 #define FLOATING_POINT_INIT() \
   do {                        \
-    unsigned short x87_orig_mode = x87_set_double_precision();
+  unsigned short x87_orig_mode = x87_set_double_precision()
 #define FLOATING_POINT_RESTORE()       \
   x87_set_control_word(x87_orig_mode); \
   }                                    \
diff --git a/libvpx/vpx/vp8cx.h b/libvpx/vpx/vp8cx.h
index 7d0dee0..a61238c 100644
--- a/libvpx/vpx/vp8cx.h
+++ b/libvpx/vpx/vp8cx.h
@@ -33,7 +33,15 @@
  * This interface provides the capability to encode raw VP8 streams.
  * @{
  */
+
+/*!\brief A single instance of the VP8 encoder.
+ *\deprecated This access mechanism is provided for backwards compatibility;
+ * prefer vpx_codec_vp8_cx().
+ */
 extern vpx_codec_iface_t vpx_codec_vp8_cx_algo;
+
+/*!\brief The interface to the VP8 encoder.
+ */
 extern vpx_codec_iface_t *vpx_codec_vp8_cx(void);
 /*!@} - end algorithm interface member group*/
 
@@ -42,7 +50,15 @@
  * This interface provides the capability to encode raw VP9 streams.
  * @{
  */
+
+/*!\brief A single instance of the VP9 encoder.
+ *\deprecated This access mechanism is provided for backwards compatibility;
+ * prefer vpx_codec_vp9_cx().
+ */
 extern vpx_codec_iface_t vpx_codec_vp9_cx_algo;
+
+/*!\brief The interface to the VP9 encoder.
+ */
 extern vpx_codec_iface_t *vpx_codec_vp9_cx(void);
 /*!@} - end algorithm interface member group*/
 
@@ -494,25 +510,13 @@
    */
   VP9E_SET_COLOR_SPACE,
 
-  /*!\brief Codec control function to set temporal layering mode.
-   * \note Valid ranges: 0..3, default is "0"
-   * (VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING).
-   *                     0 = VP9E_TEMPORAL_LAYERING_MODE_NOLAYERING
-   *                     1 = VP9E_TEMPORAL_LAYERING_MODE_BYPASS
-   *                     2 = VP9E_TEMPORAL_LAYERING_MODE_0101
-   *                     3 = VP9E_TEMPORAL_LAYERING_MODE_0212
-   *
-   * Supported in codecs: VP9
-   */
-  VP9E_SET_TEMPORAL_LAYERING_MODE,
-
   /*!\brief Codec control function to set minimum interval between GF/ARF frames
    *
    * By default the value is set as 4.
    *
    * Supported in codecs: VP9
    */
-  VP9E_SET_MIN_GF_INTERVAL,
+  VP9E_SET_MIN_GF_INTERVAL = 48,
 
   /*!\brief Codec control function to set minimum interval between GF/ARF frames
    *
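[Note: deleting VP9E_SET_TEMPORAL_LAYERING_MODE from the middle of this enum would silently renumber every control after it, breaking binaries compiled against the old IDs; pinning the next enumerator to its previous value (48 here) keeps the numbering, and therefore the control ABI, unchanged. A sketch of the hazard with illustrative values:

    enum { A = 46, B, C };      // B == 47, C == 48
    enum { A2 = 46, C2 };       // drop B naively: C2 becomes 47 (ABI break)
    enum { A3 = 46, C3 = 48 };  // drop B and pin: C3 keeps its old ID
]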
@@ -742,6 +746,17 @@
    * Supported in codecs: VP9
    */
   VP9E_GET_LAST_QUANTIZER_SVC_LAYERS,
+
+  /*!\brief Codec control to disable internal features in rate control.
+   *
+   * This will turn off cyclic refresh for vp8.
+   *
+   * With this, the rate control is expected to work exactly the same as the
+   * interface provided in vp8_ratectrl_rtc.cc/h
+   *
+   * Supported in codecs: VP8
+   */
+  VP8E_SET_RTC_EXTERNAL_RATECTRL,
 };
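[Note: the new control takes an int (see its VPX_CTRL_USE_TYPE entry later in this header), so enabling it from application code is a single call; a hedged sketch, assuming codec is an already-initialized VP8 encoder context:

    vpx_codec_control(&codec, VP8E_SET_RTC_EXTERNAL_RATECTRL, 1);
]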
 
 /*!\brief vpx 1-D scaling mode
@@ -797,8 +812,8 @@
   unsigned int rows; /**< Number of rows. */
   unsigned int cols; /**< Number of columns. */
   /*! VP8 only uses the first 4 segments. VP9 uses 8 segments. */
-  int delta_q[8];  /**< Quantizer deltas. */
-  int delta_lf[8]; /**< Loop filter deltas. */
+  int delta_q[8];  /**< Quantizer deltas. Valid range: [-63, 63].*/
+  int delta_lf[8]; /**< Loop filter deltas. Valid range: [-63, 63].*/
   /*! skip and ref frame segment is only used in VP9. */
   int skip[8];      /**< Skip this block. */
   int ref_frame[8]; /**< Reference frame for this block. */
@@ -941,28 +956,12 @@
  *
  */
 
-VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS, int)
-#define VPX_CTRL_VP8E_SET_FRAME_FLAGS
-VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID, int)
-#define VPX_CTRL_VP8E_SET_TEMPORAL_LAYER_ID
 VPX_CTRL_USE_TYPE(VP8E_SET_ROI_MAP, vpx_roi_map_t *)
 #define VPX_CTRL_VP8E_SET_ROI_MAP
-VPX_CTRL_USE_TYPE(VP9E_SET_ROI_MAP, vpx_roi_map_t *)
-#define VPX_CTRL_VP9E_SET_ROI_MAP
 VPX_CTRL_USE_TYPE(VP8E_SET_ACTIVEMAP, vpx_active_map_t *)
 #define VPX_CTRL_VP8E_SET_ACTIVEMAP
 VPX_CTRL_USE_TYPE(VP8E_SET_SCALEMODE, vpx_scaling_mode_t *)
 #define VPX_CTRL_VP8E_SET_SCALEMODE
-
-VPX_CTRL_USE_TYPE(VP9E_SET_SVC, int)
-#define VPX_CTRL_VP9E_SET_SVC
-VPX_CTRL_USE_TYPE(VP9E_SET_SVC_PARAMETERS, void *)
-#define VPX_CTRL_VP9E_SET_SVC_PARAMETERS
-VPX_CTRL_USE_TYPE(VP9E_REGISTER_CX_CALLBACK, void *)
-#define VPX_CTRL_VP9E_REGISTER_CX_CALLBACK
-VPX_CTRL_USE_TYPE(VP9E_SET_SVC_LAYER_ID, vpx_svc_layer_id_t *)
-#define VPX_CTRL_VP9E_SET_SVC_LAYER_ID
-
 VPX_CTRL_USE_TYPE(VP8E_SET_CPUUSED, int)
 #define VPX_CTRL_VP8E_SET_CPUUSED
 VPX_CTRL_USE_TYPE(VP8E_SET_ENABLEAUTOALTREF, unsigned int)
@@ -975,7 +974,10 @@
 #define VPX_CTRL_VP8E_SET_STATIC_THRESHOLD
 VPX_CTRL_USE_TYPE(VP8E_SET_TOKEN_PARTITIONS, int) /* vp8e_token_partitions */
 #define VPX_CTRL_VP8E_SET_TOKEN_PARTITIONS
-
+VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *)
+#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER
+VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *)
+#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER_64
 VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_MAXFRAMES, unsigned int)
 #define VPX_CTRL_VP8E_SET_ARNR_MAXFRAMES
 VPX_CTRL_USE_TYPE(VP8E_SET_ARNR_STRENGTH, unsigned int)
@@ -986,129 +988,103 @@
 #define VPX_CTRL_VP8E_SET_TUNING
 VPX_CTRL_USE_TYPE(VP8E_SET_CQ_LEVEL, unsigned int)
 #define VPX_CTRL_VP8E_SET_CQ_LEVEL
-
+VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
+#define VPX_CTRL_VP8E_SET_MAX_INTRA_BITRATE_PCT
+VPX_CTRL_USE_TYPE(VP8E_SET_FRAME_FLAGS, int)
+#define VPX_CTRL_VP8E_SET_FRAME_FLAGS
+VPX_CTRL_USE_TYPE(VP9E_SET_MAX_INTER_BITRATE_PCT, unsigned int)
+#define VPX_CTRL_VP9E_SET_MAX_INTER_BITRATE_PCT
+VPX_CTRL_USE_TYPE(VP9E_SET_GF_CBR_BOOST_PCT, unsigned int)
+#define VPX_CTRL_VP9E_SET_GF_CBR_BOOST_PCT
+VPX_CTRL_USE_TYPE(VP8E_SET_TEMPORAL_LAYER_ID, int)
+#define VPX_CTRL_VP8E_SET_TEMPORAL_LAYER_ID
+VPX_CTRL_USE_TYPE(VP8E_SET_SCREEN_CONTENT_MODE, unsigned int)
+#define VPX_CTRL_VP8E_SET_SCREEN_CONTENT_MODE
+VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int)
+#define VPX_CTRL_VP9E_SET_LOSSLESS
 VPX_CTRL_USE_TYPE(VP9E_SET_TILE_COLUMNS, int)
 #define VPX_CTRL_VP9E_SET_TILE_COLUMNS
 VPX_CTRL_USE_TYPE(VP9E_SET_TILE_ROWS, int)
 #define VPX_CTRL_VP9E_SET_TILE_ROWS
-
-VPX_CTRL_USE_TYPE(VP9E_SET_TPL, int)
-#define VPX_CTRL_VP9E_SET_TPL
-
-VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER, int *)
-#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER
-VPX_CTRL_USE_TYPE(VP8E_GET_LAST_QUANTIZER_64, int *)
-#define VPX_CTRL_VP8E_GET_LAST_QUANTIZER_64
-VPX_CTRL_USE_TYPE(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, int *)
-#define VPX_CTRL_VP9E_GET_LAST_QUANTIZER_SVC_LAYERS
-
-VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID, vpx_svc_layer_id_t *)
-#define VPX_CTRL_VP9E_GET_SVC_LAYER_ID
-
-VPX_CTRL_USE_TYPE(VP8E_SET_MAX_INTRA_BITRATE_PCT, unsigned int)
-#define VPX_CTRL_VP8E_SET_MAX_INTRA_BITRATE_PCT
-VPX_CTRL_USE_TYPE(VP9E_SET_MAX_INTER_BITRATE_PCT, unsigned int)
-#define VPX_CTRL_VP9E_SET_MAX_INTER_BITRATE_PCT
-
-VPX_CTRL_USE_TYPE(VP8E_SET_GF_CBR_BOOST_PCT, unsigned int)
-#define VPX_CTRL_VP8E_SET_GF_CBR_BOOST_PCT
-
-VPX_CTRL_USE_TYPE(VP8E_SET_SCREEN_CONTENT_MODE, unsigned int)
-#define VPX_CTRL_VP8E_SET_SCREEN_CONTENT_MODE
-
-VPX_CTRL_USE_TYPE(VP9E_SET_GF_CBR_BOOST_PCT, unsigned int)
-#define VPX_CTRL_VP9E_SET_GF_CBR_BOOST_PCT
-
-VPX_CTRL_USE_TYPE(VP9E_SET_LOSSLESS, unsigned int)
-#define VPX_CTRL_VP9E_SET_LOSSLESS
-
 VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PARALLEL_DECODING, unsigned int)
 #define VPX_CTRL_VP9E_SET_FRAME_PARALLEL_DECODING
-
 VPX_CTRL_USE_TYPE(VP9E_SET_AQ_MODE, unsigned int)
 #define VPX_CTRL_VP9E_SET_AQ_MODE
-
-VPX_CTRL_USE_TYPE(VP9E_SET_ALT_REF_AQ, int)
-#define VPX_CTRL_VP9E_SET_ALT_REF_AQ
-
 VPX_CTRL_USE_TYPE(VP9E_SET_FRAME_PERIODIC_BOOST, unsigned int)
 #define VPX_CTRL_VP9E_SET_FRAME_PERIODIC_BOOST
-
 VPX_CTRL_USE_TYPE(VP9E_SET_NOISE_SENSITIVITY, unsigned int)
 #define VPX_CTRL_VP9E_SET_NOISE_SENSITIVITY
-
+VPX_CTRL_USE_TYPE(VP9E_SET_SVC, int)
+#define VPX_CTRL_VP9E_SET_SVC
+VPX_CTRL_USE_TYPE(VP9E_SET_ROI_MAP, vpx_roi_map_t *)
+#define VPX_CTRL_VP9E_SET_ROI_MAP
+VPX_CTRL_USE_TYPE(VP9E_SET_SVC_PARAMETERS, void *)
+#define VPX_CTRL_VP9E_SET_SVC_PARAMETERS
+VPX_CTRL_USE_TYPE(VP9E_SET_SVC_LAYER_ID, vpx_svc_layer_id_t *)
+#define VPX_CTRL_VP9E_SET_SVC_LAYER_ID
 VPX_CTRL_USE_TYPE(VP9E_SET_TUNE_CONTENT, int) /* vp9e_tune_content */
 #define VPX_CTRL_VP9E_SET_TUNE_CONTENT
-
+VPX_CTRL_USE_TYPE(VP9E_GET_SVC_LAYER_ID, vpx_svc_layer_id_t *)
+#define VPX_CTRL_VP9E_GET_SVC_LAYER_ID
+VPX_CTRL_USE_TYPE(VP9E_REGISTER_CX_CALLBACK, void *)
+#define VPX_CTRL_VP9E_REGISTER_CX_CALLBACK
 VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_SPACE, int)
 #define VPX_CTRL_VP9E_SET_COLOR_SPACE
-
 VPX_CTRL_USE_TYPE(VP9E_SET_MIN_GF_INTERVAL, unsigned int)
 #define VPX_CTRL_VP9E_SET_MIN_GF_INTERVAL
-
 VPX_CTRL_USE_TYPE(VP9E_SET_MAX_GF_INTERVAL, unsigned int)
 #define VPX_CTRL_VP9E_SET_MAX_GF_INTERVAL
-
 VPX_CTRL_USE_TYPE(VP9E_GET_ACTIVEMAP, vpx_active_map_t *)
 #define VPX_CTRL_VP9E_GET_ACTIVEMAP
-
 VPX_CTRL_USE_TYPE(VP9E_SET_COLOR_RANGE, int)
 #define VPX_CTRL_VP9E_SET_COLOR_RANGE
-
 VPX_CTRL_USE_TYPE(VP9E_SET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *)
 #define VPX_CTRL_VP9E_SET_SVC_REF_FRAME_CONFIG
-
 VPX_CTRL_USE_TYPE(VP9E_SET_RENDER_SIZE, int *)
 #define VPX_CTRL_VP9E_SET_RENDER_SIZE
-
 VPX_CTRL_USE_TYPE(VP9E_SET_TARGET_LEVEL, unsigned int)
 #define VPX_CTRL_VP9E_SET_TARGET_LEVEL
-
 VPX_CTRL_USE_TYPE(VP9E_SET_ROW_MT, unsigned int)
 #define VPX_CTRL_VP9E_SET_ROW_MT
-
 VPX_CTRL_USE_TYPE(VP9E_GET_LEVEL, int *)
 #define VPX_CTRL_VP9E_GET_LEVEL
-
-VPX_CTRL_USE_TYPE(VP9E_GET_LOOPFILTER_LEVEL, int *)
-#define VPX_CTRL_VP9E_GET_LOOPFILTER_LEVEL
-
+VPX_CTRL_USE_TYPE(VP9E_SET_ALT_REF_AQ, int)
+#define VPX_CTRL_VP9E_SET_ALT_REF_AQ
+VPX_CTRL_USE_TYPE(VP8E_SET_GF_CBR_BOOST_PCT, unsigned int)
+#define VPX_CTRL_VP8E_SET_GF_CBR_BOOST_PCT
 VPX_CTRL_USE_TYPE(VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST, unsigned int)
 #define VPX_CTRL_VP9E_ENABLE_MOTION_VECTOR_UNIT_TEST
-
 VPX_CTRL_USE_TYPE(VP9E_SET_SVC_INTER_LAYER_PRED, unsigned int)
 #define VPX_CTRL_VP9E_SET_SVC_INTER_LAYER_PRED
-
 VPX_CTRL_USE_TYPE(VP9E_SET_SVC_FRAME_DROP_LAYER, vpx_svc_frame_drop_t *)
 #define VPX_CTRL_VP9E_SET_SVC_FRAME_DROP_LAYER
-
 VPX_CTRL_USE_TYPE(VP9E_GET_SVC_REF_FRAME_CONFIG, vpx_svc_ref_frame_config_t *)
 #define VPX_CTRL_VP9E_GET_SVC_REF_FRAME_CONFIG
-
 VPX_CTRL_USE_TYPE(VP9E_SET_SVC_GF_TEMPORAL_REF, unsigned int)
 #define VPX_CTRL_VP9E_SET_SVC_GF_TEMPORAL_REF
-
 VPX_CTRL_USE_TYPE(VP9E_SET_SVC_SPATIAL_LAYER_SYNC,
                   vpx_svc_spatial_layer_sync_t *)
 #define VPX_CTRL_VP9E_SET_SVC_SPATIAL_LAYER_SYNC
-
+VPX_CTRL_USE_TYPE(VP9E_SET_TPL, int)
+#define VPX_CTRL_VP9E_SET_TPL
 VPX_CTRL_USE_TYPE(VP9E_SET_POSTENCODE_DROP, unsigned int)
 #define VPX_CTRL_VP9E_SET_POSTENCODE_DROP
-
 VPX_CTRL_USE_TYPE(VP9E_SET_DELTA_Q_UV, int)
 #define VPX_CTRL_VP9E_SET_DELTA_Q_UV
-
 VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR, int)
 #define VPX_CTRL_VP9E_SET_DISABLE_OVERSHOOT_MAXQ_CBR
-
 VPX_CTRL_USE_TYPE(VP9E_SET_DISABLE_LOOPFILTER, int)
 #define VPX_CTRL_VP9E_SET_DISABLE_LOOPFILTER
-
-VPX_CTRL_USE_TYPE(VP9E_SET_RTC_EXTERNAL_RATECTRL, int)
-#define VPX_CTRL_VP9E_SET_RTC_EXTERNAL_RATECTRL
-
 VPX_CTRL_USE_TYPE(VP9E_SET_EXTERNAL_RATE_CONTROL, vpx_rc_funcs_t *)
 #define VPX_CTRL_VP9E_SET_EXTERNAL_RATE_CONTROL
+VPX_CTRL_USE_TYPE(VP9E_SET_RTC_EXTERNAL_RATECTRL, int)
+#define VPX_CTRL_VP9E_SET_RTC_EXTERNAL_RATECTRL
+VPX_CTRL_USE_TYPE(VP9E_GET_LOOPFILTER_LEVEL, int *)
+#define VPX_CTRL_VP9E_GET_LOOPFILTER_LEVEL
+VPX_CTRL_USE_TYPE(VP9E_GET_LAST_QUANTIZER_SVC_LAYERS, int *)
+#define VPX_CTRL_VP9E_GET_LAST_QUANTIZER_SVC_LAYERS
+VPX_CTRL_USE_TYPE(VP8E_SET_RTC_EXTERNAL_RATECTRL, int)
+#define VPX_CTRL_VP8E_SET_RTC_EXTERNAL_RATECTRL
 
 /*!\endcond */
 /*! @} - end defgroup vp8_encoder */
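
Each control ID above is paired with VPX_CTRL_USE_TYPE, which generates a type-checked wrapper so that vpx_codec_control() rejects a mismatched third argument at compile time. A minimal sketch of driving one of these controls, assuming an already-initialized encoder context:

#include <stdio.h>
#include "vpx/vpx_encoder.h"
#include "vpx/vp8cx.h"

// Enable row-based multithreading on an initialized VP9 encoder context.
static void enable_row_mt(vpx_codec_ctx_t *codec) {
  if (vpx_codec_control(codec, VP9E_SET_ROW_MT, 1) != VPX_CODEC_OK)
    fprintf(stderr, "VP9E_SET_ROW_MT: %s\n", vpx_codec_error(codec));
}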
diff --git a/libvpx/vpx/vp8dx.h b/libvpx/vpx/vp8dx.h
index af92f21..8c13649 100644
--- a/libvpx/vpx/vp8dx.h
+++ b/libvpx/vpx/vp8dx.h
@@ -32,7 +32,15 @@
  * This interface provides the capability to decode VP8 streams.
  * @{
  */
+
+/*!\brief A single instance of the VP8 decoder.
+ *\deprecated This access mechanism is provided for backwards compatibility;
+ * prefer vpx_codec_vp8_dx().
+ */
 extern vpx_codec_iface_t vpx_codec_vp8_dx_algo;
+
+/*!\brief The interface to the VP8 decoder.
+ */
 extern vpx_codec_iface_t *vpx_codec_vp8_dx(void);
 /*!@} - end algorithm interface member group*/
 
@@ -41,7 +49,15 @@
  * This interface provides the capability to decode VP9 streams.
  * @{
  */
+
+/*!\brief A single instance of the VP9 decoder.
+ *\deprecated This access mechanism is provided for backwards compatibility;
+ * prefer vpx_codec_vp9_dx().
+ */
 extern vpx_codec_iface_t vpx_codec_vp9_dx_algo;
+
+/*!\brief The interface to the VP9 decoder.
+ */
 extern vpx_codec_iface_t *vpx_codec_vp9_dx(void);
 /*!@} - end algorithm interface member group*/
 
@@ -177,28 +193,30 @@
 #define VPX_CTRL_VP8D_GET_FRAME_CORRUPTED
 VPX_CTRL_USE_TYPE(VP8D_GET_LAST_REF_USED, int *)
 #define VPX_CTRL_VP8D_GET_LAST_REF_USED
-VPX_CTRL_USE_TYPE(VPXD_GET_LAST_QUANTIZER, int *)
-#define VPX_CTRL_VPXD_GET_LAST_QUANTIZER
 VPX_CTRL_USE_TYPE(VPXD_SET_DECRYPTOR, vpx_decrypt_init *)
 #define VPX_CTRL_VPXD_SET_DECRYPTOR
 VPX_CTRL_USE_TYPE(VP8D_SET_DECRYPTOR, vpx_decrypt_init *)
 #define VPX_CTRL_VP8D_SET_DECRYPTOR
+VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *)
+#define VPX_CTRL_VP9D_GET_FRAME_SIZE
 VPX_CTRL_USE_TYPE(VP9D_GET_DISPLAY_SIZE, int *)
 #define VPX_CTRL_VP9D_GET_DISPLAY_SIZE
 VPX_CTRL_USE_TYPE(VP9D_GET_BIT_DEPTH, unsigned int *)
 #define VPX_CTRL_VP9D_GET_BIT_DEPTH
-VPX_CTRL_USE_TYPE(VP9D_GET_FRAME_SIZE, int *)
-#define VPX_CTRL_VP9D_GET_FRAME_SIZE
+VPX_CTRL_USE_TYPE(VP9_SET_BYTE_ALIGNMENT, int)
+#define VPX_CTRL_VP9_SET_BYTE_ALIGNMENT
 VPX_CTRL_USE_TYPE(VP9_INVERT_TILE_DECODE_ORDER, int)
 #define VPX_CTRL_VP9_INVERT_TILE_DECODE_ORDER
-#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER
-VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int)
-#define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER
 VPX_CTRL_USE_TYPE(VP9_SET_SKIP_LOOP_FILTER, int)
-#define VPX_CTRL_VP9_DECODE_SET_ROW_MT
+#define VPX_CTRL_VP9_SET_SKIP_LOOP_FILTER
+VPX_CTRL_USE_TYPE(VP9_DECODE_SVC_SPATIAL_LAYER, int)
+#define VPX_CTRL_VP9_DECODE_SVC_SPATIAL_LAYER
+VPX_CTRL_USE_TYPE(VPXD_GET_LAST_QUANTIZER, int *)
+#define VPX_CTRL_VPXD_GET_LAST_QUANTIZER
 VPX_CTRL_USE_TYPE(VP9D_SET_ROW_MT, int)
-#define VPX_CTRL_VP9_SET_LOOP_FILTER_OPT
+#define VPX_CTRL_VP9_DECODE_SET_ROW_MT
 VPX_CTRL_USE_TYPE(VP9D_SET_LOOP_FILTER_OPT, int)
+#define VPX_CTRL_VP9_SET_LOOP_FILTER_OPT
 
 /*!\endcond */
 /*! @} - end defgroup vp8_decoder */
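
The decoder hunk likewise reorders the typed control declarations and documents the *_dx_algo globals as deprecated in favor of the accessor functions. A minimal decode sketch using the preferred interface, assuming buf/buf_len hold one compressed VP9 frame:

#include <stddef.h>
#include <stdint.h>
#include "vpx/vpx_decoder.h"
#include "vpx/vp8dx.h"

// Initialize via vpx_codec_vp9_dx() rather than the deprecated _algo
// global, decode one frame, then query the quantizer it used.
static int decode_and_get_qp(const uint8_t *buf, size_t buf_len) {
  vpx_codec_ctx_t codec;
  int qp = -1;
  if (vpx_codec_dec_init(&codec, vpx_codec_vp9_dx(), NULL, 0) != VPX_CODEC_OK)
    return -1;
  if (vpx_codec_decode(&codec, buf, (unsigned int)buf_len, NULL, 0) ==
      VPX_CODEC_OK)
    vpx_codec_control(&codec, VPXD_GET_LAST_QUANTIZER, &qp);
  vpx_codec_destroy(&codec);
  return qp;
}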
diff --git a/libvpx/vpx/vpx_codec.mk b/libvpx/vpx/vpx_codec.mk
index 350dc24..de86579 100644
--- a/libvpx/vpx/vpx_codec.mk
+++ b/libvpx/vpx/vpx_codec.mk
@@ -33,6 +33,7 @@
 API_SRCS-yes += src/vpx_encoder.c
 API_SRCS-yes += vpx_encoder.h
 API_SRCS-yes += internal/vpx_codec_internal.h
+API_SRCS-yes += internal/vpx_ratectrl_rtc.h
 API_SRCS-yes += src/vpx_codec.c
 API_SRCS-yes += src/vpx_image.c
 API_SRCS-yes += vpx_codec.h
diff --git a/libvpx/vpx_dsp/arm/fdct16x16_neon.c b/libvpx/vpx_dsp/arm/fdct16x16_neon.c
index 6b2bebd..67f4324 100644
--- a/libvpx/vpx_dsp/arm/fdct16x16_neon.c
+++ b/libvpx/vpx_dsp/arm/fdct16x16_neon.c
@@ -15,6 +15,7 @@
 #include "vpx_dsp/txfm_common.h"
 #include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
+#include "vpx_dsp/arm/fdct16x16_neon.h"
 
 // Some builds of gcc 4.9.2 and .3 have trouble with some of the inline
 // functions.
@@ -27,316 +28,6 @@
 
 #else
 
-static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) {
-  b[0] = vld1q_s16(a);
-  a += stride;
-  b[1] = vld1q_s16(a);
-  a += stride;
-  b[2] = vld1q_s16(a);
-  a += stride;
-  b[3] = vld1q_s16(a);
-  a += stride;
-  b[4] = vld1q_s16(a);
-  a += stride;
-  b[5] = vld1q_s16(a);
-  a += stride;
-  b[6] = vld1q_s16(a);
-  a += stride;
-  b[7] = vld1q_s16(a);
-  a += stride;
-  b[8] = vld1q_s16(a);
-  a += stride;
-  b[9] = vld1q_s16(a);
-  a += stride;
-  b[10] = vld1q_s16(a);
-  a += stride;
-  b[11] = vld1q_s16(a);
-  a += stride;
-  b[12] = vld1q_s16(a);
-  a += stride;
-  b[13] = vld1q_s16(a);
-  a += stride;
-  b[14] = vld1q_s16(a);
-  a += stride;
-  b[15] = vld1q_s16(a);
-}
-
-// Store 8 16x8 values, assuming stride == 16.
-static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) {
-  store_s16q_to_tran_low(a, b[0]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[1]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[2]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[3]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[4]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[5]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[6]);
-  a += 16;
-  store_s16q_to_tran_low(a, b[7]);
-}
-
-// Load step of each pass. Add and subtract clear across the input, requiring
-// all 16 values to be loaded. For the first pass it also multiplies by 4.
-
-// To maybe reduce register usage this could be combined with the load() step to
-// get the first 4 and last 4 values, cross those, then load the middle 8 values
-// and cross them.
-static INLINE void cross_input(const int16x8_t *a /*[16]*/,
-                               int16x8_t *b /*[16]*/, const int pass) {
-  if (pass == 0) {
-    b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2);
-    b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2);
-    b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2);
-    b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2);
-    b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2);
-    b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2);
-    b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2);
-    b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2);
-
-    b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2);
-    b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2);
-    b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2);
-    b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2);
-    b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2);
-    b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2);
-    b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2);
-    b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2);
-  } else {
-    b[0] = vaddq_s16(a[0], a[15]);
-    b[1] = vaddq_s16(a[1], a[14]);
-    b[2] = vaddq_s16(a[2], a[13]);
-    b[3] = vaddq_s16(a[3], a[12]);
-    b[4] = vaddq_s16(a[4], a[11]);
-    b[5] = vaddq_s16(a[5], a[10]);
-    b[6] = vaddq_s16(a[6], a[9]);
-    b[7] = vaddq_s16(a[7], a[8]);
-
-    b[8] = vsubq_s16(a[7], a[8]);
-    b[9] = vsubq_s16(a[6], a[9]);
-    b[10] = vsubq_s16(a[5], a[10]);
-    b[11] = vsubq_s16(a[4], a[11]);
-    b[12] = vsubq_s16(a[3], a[12]);
-    b[13] = vsubq_s16(a[2], a[13]);
-    b[14] = vsubq_s16(a[1], a[14]);
-    b[15] = vsubq_s16(a[0], a[15]);
-  }
-}
-
-// Quarter round at the beginning of the second pass. Can't use vrshr (rounding)
-// because this only adds 1, not 1 << 2.
-static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) {
-  const int16x8_t one = vdupq_n_s16(1);
-  a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2);
-  a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2);
-  a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2);
-  a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2);
-  a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2);
-  a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2);
-  a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2);
-  a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2);
-  a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2);
-  a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2);
-  a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2);
-  a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2);
-  a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2);
-  a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2);
-  a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2);
-  a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2);
-}
-
-// fdct_round_shift((a +/- b) * c)
-static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
-                                       const tran_high_t c, int16x8_t *add,
-                                       int16x8_t *sub) {
-  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c);
-  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c);
-  const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c);
-  const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c);
-  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c);
-  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c);
-  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
-  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
-  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
-  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
-  *add = vcombine_s16(rounded0, rounded1);
-  *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// fdct_round_shift(a * c0 +/- b * c1)
-static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
-                                       const tran_coef_t c0,
-                                       const tran_coef_t c1, int16x8_t *add,
-                                       int16x8_t *sub) {
-  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0);
-  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0);
-  const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1);
-  const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1);
-  const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0);
-  const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0);
-  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1);
-  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1);
-  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
-  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
-  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
-  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
-  *add = vcombine_s16(rounded0, rounded1);
-  *sub = vcombine_s16(rounded2, rounded3);
-}
-
-// Transpose 8x8 to a new location. Don't use transpose_neon.h because those
-// are all in-place.
-static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/,
-                                 int16x8_t *b /*[8]*/) {
-  // Swap 16 bit elements.
-  const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
-  const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
-  const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
-  const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
-
-  // Swap 32 bit elements.
-  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
-                                   vreinterpretq_s32_s16(c1.val[0]));
-  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
-                                   vreinterpretq_s32_s16(c1.val[1]));
-  const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
-                                   vreinterpretq_s32_s16(c3.val[0]));
-  const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
-                                   vreinterpretq_s32_s16(c3.val[1]));
-
-  // Swap 64 bit elements
-  const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
-  const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
-  const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
-  const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
-
-  b[0] = e0.val[0];
-  b[1] = e1.val[0];
-  b[2] = e2.val[0];
-  b[3] = e3.val[0];
-  b[4] = e0.val[1];
-  b[5] = e1.val[1];
-  b[6] = e2.val[1];
-  b[7] = e3.val[1];
-}
-
-// Main body of fdct16x16.
-static void dct_body(const int16x8_t *in /*[16]*/, int16x8_t *out /*[16]*/) {
-  int16x8_t s[8];
-  int16x8_t x[4];
-  int16x8_t step[8];
-
-  // stage 1
-  // From fwd_txfm.c: Work on the first eight values; fdct8(input,
-  // even_results);"
-  s[0] = vaddq_s16(in[0], in[7]);
-  s[1] = vaddq_s16(in[1], in[6]);
-  s[2] = vaddq_s16(in[2], in[5]);
-  s[3] = vaddq_s16(in[3], in[4]);
-  s[4] = vsubq_s16(in[3], in[4]);
-  s[5] = vsubq_s16(in[2], in[5]);
-  s[6] = vsubq_s16(in[1], in[6]);
-  s[7] = vsubq_s16(in[0], in[7]);
-
-  // fdct4(step, step);
-  x[0] = vaddq_s16(s[0], s[3]);
-  x[1] = vaddq_s16(s[1], s[2]);
-  x[2] = vsubq_s16(s[1], s[2]);
-  x[3] = vsubq_s16(s[0], s[3]);
-
-  // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
-  // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
-  butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]);
-  // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
-  // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
-  butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]);
-
-  //  Stage 2
-  // Re-using source s5/s6
-  // s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
-  // s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
-  butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]);
-
-  //  Stage 3
-  x[0] = vaddq_s16(s[4], s[5]);
-  x[1] = vsubq_s16(s[4], s[5]);
-  x[2] = vsubq_s16(s[7], s[6]);
-  x[3] = vaddq_s16(s[7], s[6]);
-
-  // Stage 4
-  // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
-  // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
-  butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]);
-  // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 *  cospi_20_64)
-  // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
-  butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]);
-
-  // step 2
-  // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
-  // That file distinguished between "in_high" and "step1" but the only
-  // difference is that "in_high" is the first 8 values and "step 1" is the
-  // second. Here, since they are all in one array, "step1" values are += 8.
-
-  // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64)
-  // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
-  // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
-  // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
-  butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]);
-  butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]);
-
-  // step 3
-  s[0] = vaddq_s16(in[8], s[3]);
-  s[1] = vaddq_s16(in[9], s[2]);
-  x[0] = vsubq_s16(in[9], s[2]);
-  x[1] = vsubq_s16(in[8], s[3]);
-  x[2] = vsubq_s16(in[15], s[4]);
-  x[3] = vsubq_s16(in[14], s[5]);
-  s[6] = vaddq_s16(in[14], s[5]);
-  s[7] = vaddq_s16(in[15], s[4]);
-
-  // step 4
-  // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64)
-  // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64)
-  butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]);
-
-  // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
-  // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64)
-  butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]);
-
-  // step 5
-  step[0] = vaddq_s16(s[0], s[1]);
-  step[1] = vsubq_s16(s[0], s[1]);
-  step[2] = vaddq_s16(x[1], s[2]);
-  step[3] = vsubq_s16(x[1], s[2]);
-  step[4] = vsubq_s16(x[2], s[5]);
-  step[5] = vaddq_s16(x[2], s[5]);
-  step[6] = vsubq_s16(s[7], s[6]);
-  step[7] = vaddq_s16(s[7], s[6]);
-
-  // step 6
-  // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64)
-  // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64)
-  // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64)
-  // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64)
-  // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64)
-  // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] *
-  // cospi_22_64)
-  // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64)
-  // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64)
-  butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9],
-                      &out[7]);
-  butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1],
-                      &out[15]);
-  butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13],
-                      &out[3]);
-  butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5],
-                      &out[11]);
-}
-
 void vpx_fdct16x16_neon(const int16_t *input, tran_low_t *output, int stride) {
   int16x8_t temp0[16];
   int16x8_t temp1[16];
@@ -346,12 +37,12 @@
   // Left half.
   load(input, stride, temp0);
   cross_input(temp0, temp1, 0);
-  dct_body(temp1, temp0);
+  vpx_fdct16x16_body(temp1, temp0);
 
   // Right half.
   load(input + 8, stride, temp1);
   cross_input(temp1, temp2, 0);
-  dct_body(temp2, temp1);
+  vpx_fdct16x16_body(temp2, temp1);
 
   // Transpose top left and top right quarters into one contiguous location to
   // process to the top half.
@@ -359,7 +50,7 @@
   transpose_8x8(&temp1[0], &temp2[8]);
   partial_round_shift(temp2);
   cross_input(temp2, temp3, 1);
-  dct_body(temp3, temp2);
+  vpx_fdct16x16_body(temp3, temp2);
   transpose_s16_8x8(&temp2[0], &temp2[1], &temp2[2], &temp2[3], &temp2[4],
                     &temp2[5], &temp2[6], &temp2[7]);
   transpose_s16_8x8(&temp2[8], &temp2[9], &temp2[10], &temp2[11], &temp2[12],
@@ -375,7 +66,7 @@
                     &temp1[13], &temp1[14], &temp1[15]);
   partial_round_shift(temp1);
   cross_input(temp1, temp0, 1);
-  dct_body(temp0, temp1);
+  vpx_fdct16x16_body(temp0, temp1);
   transpose_s16_8x8(&temp1[0], &temp1[1], &temp1[2], &temp1[3], &temp1[4],
                     &temp1[5], &temp1[6], &temp1[7]);
   transpose_s16_8x8(&temp1[8], &temp1[9], &temp1[10], &temp1[11], &temp1[12],
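
The transform body moves into the new shared header below (dct_body renamed to vpx_fdct16x16_body) so other NEON translation units can include and reuse it. One detail worth noting: between the two passes, partial_round_shift() divides by 4 after adding a bias of only 1, so a true NEON rounding shift cannot be used. A scalar equivalent of what each lane computes (a sketch, not library code):

#include <stdint.h>

// Per-element effect of partial_round_shift(): add 1, then shift right by
// 2 -- a smaller bias than a round-to-nearest shift would apply.
static int16_t partial_round_shift_c(int16_t a) {
  return (int16_t)((a + 1) >> 2);
}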
diff --git a/libvpx/vpx_dsp/arm/fdct16x16_neon.h b/libvpx/vpx_dsp/arm/fdct16x16_neon.h
new file mode 100644
index 0000000..0dd2115
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/fdct16x16_neon.h
@@ -0,0 +1,327 @@
+/*
+ *  Copyright (c) 2017 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void load(const int16_t *a, int stride, int16x8_t *b /*[16]*/) {
+  b[0] = vld1q_s16(a);
+  a += stride;
+  b[1] = vld1q_s16(a);
+  a += stride;
+  b[2] = vld1q_s16(a);
+  a += stride;
+  b[3] = vld1q_s16(a);
+  a += stride;
+  b[4] = vld1q_s16(a);
+  a += stride;
+  b[5] = vld1q_s16(a);
+  a += stride;
+  b[6] = vld1q_s16(a);
+  a += stride;
+  b[7] = vld1q_s16(a);
+  a += stride;
+  b[8] = vld1q_s16(a);
+  a += stride;
+  b[9] = vld1q_s16(a);
+  a += stride;
+  b[10] = vld1q_s16(a);
+  a += stride;
+  b[11] = vld1q_s16(a);
+  a += stride;
+  b[12] = vld1q_s16(a);
+  a += stride;
+  b[13] = vld1q_s16(a);
+  a += stride;
+  b[14] = vld1q_s16(a);
+  a += stride;
+  b[15] = vld1q_s16(a);
+}
+
+// Store 8 16x8 values, assuming stride == 16.
+static INLINE void store(tran_low_t *a, const int16x8_t *b /*[8]*/) {
+  store_s16q_to_tran_low(a, b[0]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[1]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[2]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[3]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[4]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[5]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[6]);
+  a += 16;
+  store_s16q_to_tran_low(a, b[7]);
+}
+
+// Load step of each pass. Add and subtract clear across the input, requiring
+// all 16 values to be loaded. For the first pass it also multiplies by 4.
+
+// To maybe reduce register usage this could be combined with the load() step to
+// get the first 4 and last 4 values, cross those, then load the middle 8 values
+// and cross them.
+static INLINE void cross_input(const int16x8_t *a /*[16]*/,
+                               int16x8_t *b /*[16]*/, const int pass) {
+  if (pass == 0) {
+    b[0] = vshlq_n_s16(vaddq_s16(a[0], a[15]), 2);
+    b[1] = vshlq_n_s16(vaddq_s16(a[1], a[14]), 2);
+    b[2] = vshlq_n_s16(vaddq_s16(a[2], a[13]), 2);
+    b[3] = vshlq_n_s16(vaddq_s16(a[3], a[12]), 2);
+    b[4] = vshlq_n_s16(vaddq_s16(a[4], a[11]), 2);
+    b[5] = vshlq_n_s16(vaddq_s16(a[5], a[10]), 2);
+    b[6] = vshlq_n_s16(vaddq_s16(a[6], a[9]), 2);
+    b[7] = vshlq_n_s16(vaddq_s16(a[7], a[8]), 2);
+
+    b[8] = vshlq_n_s16(vsubq_s16(a[7], a[8]), 2);
+    b[9] = vshlq_n_s16(vsubq_s16(a[6], a[9]), 2);
+    b[10] = vshlq_n_s16(vsubq_s16(a[5], a[10]), 2);
+    b[11] = vshlq_n_s16(vsubq_s16(a[4], a[11]), 2);
+    b[12] = vshlq_n_s16(vsubq_s16(a[3], a[12]), 2);
+    b[13] = vshlq_n_s16(vsubq_s16(a[2], a[13]), 2);
+    b[14] = vshlq_n_s16(vsubq_s16(a[1], a[14]), 2);
+    b[15] = vshlq_n_s16(vsubq_s16(a[0], a[15]), 2);
+  } else {
+    b[0] = vaddq_s16(a[0], a[15]);
+    b[1] = vaddq_s16(a[1], a[14]);
+    b[2] = vaddq_s16(a[2], a[13]);
+    b[3] = vaddq_s16(a[3], a[12]);
+    b[4] = vaddq_s16(a[4], a[11]);
+    b[5] = vaddq_s16(a[5], a[10]);
+    b[6] = vaddq_s16(a[6], a[9]);
+    b[7] = vaddq_s16(a[7], a[8]);
+
+    b[8] = vsubq_s16(a[7], a[8]);
+    b[9] = vsubq_s16(a[6], a[9]);
+    b[10] = vsubq_s16(a[5], a[10]);
+    b[11] = vsubq_s16(a[4], a[11]);
+    b[12] = vsubq_s16(a[3], a[12]);
+    b[13] = vsubq_s16(a[2], a[13]);
+    b[14] = vsubq_s16(a[1], a[14]);
+    b[15] = vsubq_s16(a[0], a[15]);
+  }
+}
+
+// Quarter round at the beginning of the second pass. Can't use vrshr (rounding)
+// because this only adds 1, not 1 << 2.
+static INLINE void partial_round_shift(int16x8_t *a /*[16]*/) {
+  const int16x8_t one = vdupq_n_s16(1);
+  a[0] = vshrq_n_s16(vaddq_s16(a[0], one), 2);
+  a[1] = vshrq_n_s16(vaddq_s16(a[1], one), 2);
+  a[2] = vshrq_n_s16(vaddq_s16(a[2], one), 2);
+  a[3] = vshrq_n_s16(vaddq_s16(a[3], one), 2);
+  a[4] = vshrq_n_s16(vaddq_s16(a[4], one), 2);
+  a[5] = vshrq_n_s16(vaddq_s16(a[5], one), 2);
+  a[6] = vshrq_n_s16(vaddq_s16(a[6], one), 2);
+  a[7] = vshrq_n_s16(vaddq_s16(a[7], one), 2);
+  a[8] = vshrq_n_s16(vaddq_s16(a[8], one), 2);
+  a[9] = vshrq_n_s16(vaddq_s16(a[9], one), 2);
+  a[10] = vshrq_n_s16(vaddq_s16(a[10], one), 2);
+  a[11] = vshrq_n_s16(vaddq_s16(a[11], one), 2);
+  a[12] = vshrq_n_s16(vaddq_s16(a[12], one), 2);
+  a[13] = vshrq_n_s16(vaddq_s16(a[13], one), 2);
+  a[14] = vshrq_n_s16(vaddq_s16(a[14], one), 2);
+  a[15] = vshrq_n_s16(vaddq_s16(a[15], one), 2);
+}
+
+// fdct_round_shift((a +/- b) * c)
+static INLINE void butterfly_one_coeff(const int16x8_t a, const int16x8_t b,
+                                       const tran_high_t c, int16x8_t *add,
+                                       int16x8_t *sub) {
+  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c);
+  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c);
+  const int32x4_t sum0 = vmlal_n_s16(a0, vget_low_s16(b), c);
+  const int32x4_t sum1 = vmlal_n_s16(a1, vget_high_s16(b), c);
+  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c);
+  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c);
+  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
+  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
+  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
+  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
+  *add = vcombine_s16(rounded0, rounded1);
+  *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// fdct_round_shift(a * c0 +/- b * c1)
+static INLINE void butterfly_two_coeff(const int16x8_t a, const int16x8_t b,
+                                       const tran_coef_t c0,
+                                       const tran_coef_t c1, int16x8_t *add,
+                                       int16x8_t *sub) {
+  const int32x4_t a0 = vmull_n_s16(vget_low_s16(a), c0);
+  const int32x4_t a1 = vmull_n_s16(vget_high_s16(a), c0);
+  const int32x4_t a2 = vmull_n_s16(vget_low_s16(a), c1);
+  const int32x4_t a3 = vmull_n_s16(vget_high_s16(a), c1);
+  const int32x4_t sum0 = vmlal_n_s16(a2, vget_low_s16(b), c0);
+  const int32x4_t sum1 = vmlal_n_s16(a3, vget_high_s16(b), c0);
+  const int32x4_t diff0 = vmlsl_n_s16(a0, vget_low_s16(b), c1);
+  const int32x4_t diff1 = vmlsl_n_s16(a1, vget_high_s16(b), c1);
+  const int16x4_t rounded0 = vqrshrn_n_s32(sum0, 14);
+  const int16x4_t rounded1 = vqrshrn_n_s32(sum1, 14);
+  const int16x4_t rounded2 = vqrshrn_n_s32(diff0, 14);
+  const int16x4_t rounded3 = vqrshrn_n_s32(diff1, 14);
+  *add = vcombine_s16(rounded0, rounded1);
+  *sub = vcombine_s16(rounded2, rounded3);
+}
+
+// Transpose 8x8 to a new location. Don't use transpose_neon.h because those
+// are all in-place.
+static INLINE void transpose_8x8(const int16x8_t *a /*[8]*/,
+                                 int16x8_t *b /*[8]*/) {
+  // Swap 16 bit elements.
+  const int16x8x2_t c0 = vtrnq_s16(a[0], a[1]);
+  const int16x8x2_t c1 = vtrnq_s16(a[2], a[3]);
+  const int16x8x2_t c2 = vtrnq_s16(a[4], a[5]);
+  const int16x8x2_t c3 = vtrnq_s16(a[6], a[7]);
+
+  // Swap 32 bit elements.
+  const int32x4x2_t d0 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[0]),
+                                   vreinterpretq_s32_s16(c1.val[0]));
+  const int32x4x2_t d1 = vtrnq_s32(vreinterpretq_s32_s16(c0.val[1]),
+                                   vreinterpretq_s32_s16(c1.val[1]));
+  const int32x4x2_t d2 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[0]),
+                                   vreinterpretq_s32_s16(c3.val[0]));
+  const int32x4x2_t d3 = vtrnq_s32(vreinterpretq_s32_s16(c2.val[1]),
+                                   vreinterpretq_s32_s16(c3.val[1]));
+
+  // Swap 64 bit elements
+  const int16x8x2_t e0 = vpx_vtrnq_s64_to_s16(d0.val[0], d2.val[0]);
+  const int16x8x2_t e1 = vpx_vtrnq_s64_to_s16(d1.val[0], d3.val[0]);
+  const int16x8x2_t e2 = vpx_vtrnq_s64_to_s16(d0.val[1], d2.val[1]);
+  const int16x8x2_t e3 = vpx_vtrnq_s64_to_s16(d1.val[1], d3.val[1]);
+
+  b[0] = e0.val[0];
+  b[1] = e1.val[0];
+  b[2] = e2.val[0];
+  b[3] = e3.val[0];
+  b[4] = e0.val[1];
+  b[5] = e1.val[1];
+  b[6] = e2.val[1];
+  b[7] = e3.val[1];
+}
+
+// Main body of fdct16x16.
+static void vpx_fdct16x16_body(const int16x8_t *in /*[16]*/,
+                               int16x8_t *out /*[16]*/) {
+  int16x8_t s[8];
+  int16x8_t x[4];
+  int16x8_t step[8];
+
+  // stage 1
+  // From fwd_txfm.c: Work on the first eight values; fdct8(input,
+  // even_results);"
+  s[0] = vaddq_s16(in[0], in[7]);
+  s[1] = vaddq_s16(in[1], in[6]);
+  s[2] = vaddq_s16(in[2], in[5]);
+  s[3] = vaddq_s16(in[3], in[4]);
+  s[4] = vsubq_s16(in[3], in[4]);
+  s[5] = vsubq_s16(in[2], in[5]);
+  s[6] = vsubq_s16(in[1], in[6]);
+  s[7] = vsubq_s16(in[0], in[7]);
+
+  // fdct4(step, step);
+  x[0] = vaddq_s16(s[0], s[3]);
+  x[1] = vaddq_s16(s[1], s[2]);
+  x[2] = vsubq_s16(s[1], s[2]);
+  x[3] = vsubq_s16(s[0], s[3]);
+
+  // out[0] = fdct_round_shift((x0 + x1) * cospi_16_64)
+  // out[8] = fdct_round_shift((x0 - x1) * cospi_16_64)
+  butterfly_one_coeff(x[0], x[1], cospi_16_64, &out[0], &out[8]);
+  // out[4] = fdct_round_shift(x3 * cospi_8_64 + x2 * cospi_24_64);
+  // out[12] = fdct_round_shift(x3 * cospi_24_64 - x2 * cospi_8_64);
+  butterfly_two_coeff(x[3], x[2], cospi_24_64, cospi_8_64, &out[4], &out[12]);
+
+  //  Stage 2
+  // Re-using source s5/s6
+  // s5 = fdct_round_shift((s6 - s5) * cospi_16_64)
+  // s6 = fdct_round_shift((s6 + s5) * cospi_16_64)
+  butterfly_one_coeff(s[6], s[5], cospi_16_64, &s[6], &s[5]);
+
+  //  Stage 3
+  x[0] = vaddq_s16(s[4], s[5]);
+  x[1] = vsubq_s16(s[4], s[5]);
+  x[2] = vsubq_s16(s[7], s[6]);
+  x[3] = vaddq_s16(s[7], s[6]);
+
+  // Stage 4
+  // out[2] = fdct_round_shift(x0 * cospi_28_64 + x3 * cospi_4_64)
+  // out[14] = fdct_round_shift(x3 * cospi_28_64 + x0 * -cospi_4_64)
+  butterfly_two_coeff(x[3], x[0], cospi_28_64, cospi_4_64, &out[2], &out[14]);
+  // out[6] = fdct_round_shift(x1 * cospi_12_64 + x2 *  cospi_20_64)
+  // out[10] = fdct_round_shift(x2 * cospi_12_64 + x1 * -cospi_20_64)
+  butterfly_two_coeff(x[2], x[1], cospi_12_64, cospi_20_64, &out[10], &out[6]);
+
+  // step 2
+  // From fwd_txfm.c: Work on the next eight values; step1 -> odd_results"
+  // That file distinguished between "in_high" and "step1" but the only
+  // difference is that "in_high" is the first 8 values and "step 1" is the
+  // second. Here, since they are all in one array, "step1" values are += 8.
+
+  // step2[2] = fdct_round_shift((step1[5] - step1[2]) * cospi_16_64)
+  // step2[3] = fdct_round_shift((step1[4] - step1[3]) * cospi_16_64)
+  // step2[4] = fdct_round_shift((step1[4] + step1[3]) * cospi_16_64)
+  // step2[5] = fdct_round_shift((step1[5] + step1[2]) * cospi_16_64)
+  butterfly_one_coeff(in[13], in[10], cospi_16_64, &s[5], &s[2]);
+  butterfly_one_coeff(in[12], in[11], cospi_16_64, &s[4], &s[3]);
+
+  // step 3
+  s[0] = vaddq_s16(in[8], s[3]);
+  s[1] = vaddq_s16(in[9], s[2]);
+  x[0] = vsubq_s16(in[9], s[2]);
+  x[1] = vsubq_s16(in[8], s[3]);
+  x[2] = vsubq_s16(in[15], s[4]);
+  x[3] = vsubq_s16(in[14], s[5]);
+  s[6] = vaddq_s16(in[14], s[5]);
+  s[7] = vaddq_s16(in[15], s[4]);
+
+  // step 4
+  // step2[1] = fdct_round_shift(step3[1] *-cospi_8_64 + step3[6] * cospi_24_64)
+  // step2[6] = fdct_round_shift(step3[1] * cospi_24_64 + step3[6] * cospi_8_64)
+  butterfly_two_coeff(s[6], s[1], cospi_24_64, cospi_8_64, &s[6], &s[1]);
+
+  // step2[2] = fdct_round_shift(step3[2] * cospi_24_64 + step3[5] * cospi_8_64)
+  // step2[5] = fdct_round_shift(step3[2] * cospi_8_64 - step3[5] * cospi_24_64)
+  butterfly_two_coeff(x[0], x[3], cospi_8_64, cospi_24_64, &s[2], &s[5]);
+
+  // step 5
+  step[0] = vaddq_s16(s[0], s[1]);
+  step[1] = vsubq_s16(s[0], s[1]);
+  step[2] = vaddq_s16(x[1], s[2]);
+  step[3] = vsubq_s16(x[1], s[2]);
+  step[4] = vsubq_s16(x[2], s[5]);
+  step[5] = vaddq_s16(x[2], s[5]);
+  step[6] = vsubq_s16(s[7], s[6]);
+  step[7] = vaddq_s16(s[7], s[6]);
+
+  // step 6
+  // out[1] = fdct_round_shift(step1[0] * cospi_30_64 + step1[7] * cospi_2_64)
+  // out[9] = fdct_round_shift(step1[1] * cospi_14_64 + step1[6] * cospi_18_64)
+  // out[5] = fdct_round_shift(step1[2] * cospi_22_64 + step1[5] * cospi_10_64)
+  // out[13] = fdct_round_shift(step1[3] * cospi_6_64 + step1[4] * cospi_26_64)
+  // out[3] = fdct_round_shift(step1[3] * -cospi_26_64 + step1[4] * cospi_6_64)
+  // out[11] = fdct_round_shift(step1[2] * -cospi_10_64 + step1[5] *
+  // cospi_22_64)
+  // out[7] = fdct_round_shift(step1[1] * -cospi_18_64 + step1[6] * cospi_14_64)
+  // out[15] = fdct_round_shift(step1[0] * -cospi_2_64 + step1[7] * cospi_30_64)
+  butterfly_two_coeff(step[6], step[1], cospi_14_64, cospi_18_64, &out[9],
+                      &out[7]);
+  butterfly_two_coeff(step[7], step[0], cospi_30_64, cospi_2_64, &out[1],
+                      &out[15]);
+  butterfly_two_coeff(step[4], step[3], cospi_6_64, cospi_26_64, &out[13],
+                      &out[3]);
+  butterfly_two_coeff(step[5], step[2], cospi_22_64, cospi_10_64, &out[5],
+                      &out[11]);
+}
+
+#endif  // VPX_VPX_DSP_ARM_FDCT16X16_NEON_H_
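
The two butterfly helpers in this header are 8-lane versions of libvpx's scalar fdct_round_shift pattern. A scalar reference for what each lane computes, written to match the vmlal/vmlsl operand order above (a sketch; DCT_CONST_BITS is 14 in vpx_dsp/txfm_common.h, and the NEON code additionally saturates through vqrshrn):

#include <stdint.h>

// fdct_round_shift(x): round-to-nearest shift by DCT_CONST_BITS (14).
static int16_t fdct_round_shift_c(int32_t x) {
  return (int16_t)((x + (1 << 13)) >> 14);
}

// butterfly_one_coeff: fdct_round_shift((a +/- b) * c).
static void butterfly_one_coeff_c(int16_t a, int16_t b, int32_t c,
                                  int16_t *add, int16_t *sub) {
  *add = fdct_round_shift_c((a + b) * c);
  *sub = fdct_round_shift_c((a - b) * c);
}

// butterfly_two_coeff, following the intrinsics above:
//   add = fdct_round_shift(a * c1 + b * c0)
//   sub = fdct_round_shift(a * c0 - b * c1)
static void butterfly_two_coeff_c(int16_t a, int16_t b, int32_t c0,
                                  int32_t c1, int16_t *add, int16_t *sub) {
  *add = fdct_round_shift_c(a * c1 + b * c0);
  *sub = fdct_round_shift_c(a * c0 - b * c1);
}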
diff --git a/libvpx/vpx_dsp/arm/fdct32x32_neon.c b/libvpx/vpx_dsp/arm/fdct32x32_neon.c
index e9cd349..de74e66 100644
--- a/libvpx/vpx_dsp/arm/fdct32x32_neon.c
+++ b/libvpx/vpx_dsp/arm/fdct32x32_neon.c
@@ -153,7 +153,7 @@
   do {                                        \
     store_s16q_to_tran_low(dest, src[index]); \
     dest += 8;                                \
-  } while (0);
+  } while (0)
 
 // Store 32 16x8 values, assuming stride == 32.
 // Slight twist: store horizontally in blocks of 8.
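
Dropping the semicolon after while (0) restores the standard macro-hygiene idiom: the caller's own semicolon terminates the statement, so the macro can be used in an unbraced if/else. A hypothetical illustration (STORE_NEXT and demo are made-up names) of what the stray semicolon broke:

// With "} while (0);" in the definition, the 'if' body below would expand
// to two statements (the do/while plus an empty ";"), orphaning the 'else'
// and breaking the build. With "} while (0)" it is exactly one statement.
#define STORE_NEXT(dest, x) \
  do {                      \
    *(dest)++ = (x);        \
  } while (0)

void demo(int *dst, int v, int ready) {
  if (ready)
    STORE_NEXT(dst, v);
  else
    v = 0;
}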
diff --git a/libvpx/vpx_dsp/arm/fdct_neon.c b/libvpx/vpx_dsp/arm/fdct_neon.c
index 3708cbb..2827791 100644
--- a/libvpx/vpx_dsp/arm/fdct_neon.c
+++ b/libvpx/vpx_dsp/arm/fdct_neon.c
@@ -15,6 +15,7 @@
 #include "vpx_dsp/txfm_common.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
 #include "vpx_dsp/arm/mem_neon.h"
 #include "vpx_dsp/arm/transpose_neon.h"
 
@@ -22,67 +23,25 @@
                       int stride) {
   int i;
   // input[M * stride] * 16
-  int16x4_t input_0 = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
-  int16x4_t input_1 = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
-  int16x4_t input_2 = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
-  int16x4_t input_3 = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
+  int16x4_t in[4];
+  in[0] = vshl_n_s16(vld1_s16(input + 0 * stride), 4);
+  in[1] = vshl_n_s16(vld1_s16(input + 1 * stride), 4);
+  in[2] = vshl_n_s16(vld1_s16(input + 2 * stride), 4);
+  in[3] = vshl_n_s16(vld1_s16(input + 3 * stride), 4);
 
   // If the very first value != 0, then add 1.
   if (input[0] != 0) {
     const int16x4_t one = vreinterpret_s16_s64(vdup_n_s64(1));
-    input_0 = vadd_s16(input_0, one);
+    in[0] = vadd_s16(in[0], one);
   }
-
   for (i = 0; i < 2; ++i) {
-    const int16x8_t input_01 = vcombine_s16(input_0, input_1);
-    const int16x8_t input_32 = vcombine_s16(input_3, input_2);
-
-    // in_0 +/- in_3, in_1 +/- in_2
-    const int16x8_t s_01 = vaddq_s16(input_01, input_32);
-    const int16x8_t s_32 = vsubq_s16(input_01, input_32);
-
-    // step_0 +/- step_1, step_2 +/- step_3
-    const int16x4_t s_0 = vget_low_s16(s_01);
-    const int16x4_t s_1 = vget_high_s16(s_01);
-    const int16x4_t s_2 = vget_high_s16(s_32);
-    const int16x4_t s_3 = vget_low_s16(s_32);
-
-    // (s_0 +/- s_1) * cospi_16_64
-    // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
-    const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
-    const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
-    const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
-    const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
-
-    // fdct_round_shift
-    int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
-    int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
-
-    // s_3 * cospi_8_64 + s_2 * cospi_24_64
-    // s_3 * cospi_24_64 - s_2 * cospi_8_64
-    const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
-    const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
-
-    const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
-    const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
-
-    // fdct_round_shift
-    int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
-    int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
-
-    transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
-
-    input_0 = out_0;
-    input_1 = out_1;
-    input_2 = out_2;
-    input_3 = out_3;
+    vpx_fdct4x4_pass1_neon(in);
   }
-
   {
     // Not quite a rounding shift. Only add 1 despite shifting by 2.
     const int16x8_t one = vdupq_n_s16(1);
-    int16x8_t out_01 = vcombine_s16(input_0, input_1);
-    int16x8_t out_23 = vcombine_s16(input_2, input_3);
+    int16x8_t out_01 = vcombine_s16(in[0], in[1]);
+    int16x8_t out_23 = vcombine_s16(in[2], in[3]);
     out_01 = vshrq_n_s16(vaddq_s16(out_01, one), 2);
     out_23 = vshrq_n_s16(vaddq_s16(out_23, one), 2);
     store_s16q_to_tran_low(final_output + 0 * 8, out_01);
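
The refactored loop calls vpx_fdct4x4_pass1_neon() twice because the helper ends with a transpose: the first call transforms the rows, the second transforms what were the columns, and the output lands back in row order. A toy model of that shape with a stand-in 1-D transform (hypothetical code, not libvpx's):

#include <stdint.h>

// pass1: 1-D-transform each row, then transpose in place.
static void pass1(int32_t m[4][4]) {
  int32_t t[4][4];
  int r, c;
  for (r = 0; r < 4; ++r) {
    t[r][0] = m[r][0] + m[r][1] + m[r][2] + m[r][3];  // stand-in transform
    t[r][1] = m[r][0] - m[r][1];
    t[r][2] = m[r][0] - m[r][2];
    t[r][3] = m[r][0] - m[r][3];
  }
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c) m[r][c] = t[c][r];  // transpose on write-back
}

static void separable_2d(int32_t m[4][4]) {
  pass1(m);  // rows (result stored transposed)
  pass1(m);  // former columns; transpose restores row order
}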
diff --git a/libvpx/vpx_dsp/arm/fdct_neon.h b/libvpx/vpx_dsp/arm/fdct_neon.h
new file mode 100644
index 0000000..28d7d86
--- /dev/null
+++ b/libvpx/vpx_dsp/arm/fdct_neon.h
@@ -0,0 +1,213 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_ARM_FDCT_NEON_H_
+#define VPX_VPX_DSP_ARM_FDCT_NEON_H_
+
+#include <arm_neon.h>
+
+static INLINE void vpx_fdct4x4_pass1_neon(int16x4_t *in) {
+  const int16x8_t input_01 = vcombine_s16(in[0], in[1]);
+  const int16x8_t input_32 = vcombine_s16(in[3], in[2]);
+
+  // in_0 +/- in_3, in_1 +/- in_2
+  const int16x8_t s_01 = vaddq_s16(input_01, input_32);
+  const int16x8_t s_32 = vsubq_s16(input_01, input_32);
+
+  // step_0 +/- step_1, step_2 +/- step_3
+  const int16x4_t s_0 = vget_low_s16(s_01);
+  const int16x4_t s_1 = vget_high_s16(s_01);
+  const int16x4_t s_2 = vget_high_s16(s_32);
+  const int16x4_t s_3 = vget_low_s16(s_32);
+
+  // (s_0 +/- s_1) * cospi_16_64
+  // Must expand all elements to s32. See 'needs32' comment in fwd_txfm.c.
+  const int32x4_t s_0_p_s_1 = vaddl_s16(s_0, s_1);
+  const int32x4_t s_0_m_s_1 = vsubl_s16(s_0, s_1);
+  const int32x4_t temp1 = vmulq_n_s32(s_0_p_s_1, cospi_16_64);
+  const int32x4_t temp2 = vmulq_n_s32(s_0_m_s_1, cospi_16_64);
+
+  // fdct_round_shift
+  int16x4_t out_0 = vrshrn_n_s32(temp1, DCT_CONST_BITS);
+  int16x4_t out_2 = vrshrn_n_s32(temp2, DCT_CONST_BITS);
+
+  // s_3 * cospi_8_64 + s_2 * cospi_24_64
+  // s_3 * cospi_24_64 - s_2 * cospi_8_64
+  const int32x4_t s_3_cospi_8_64 = vmull_n_s16(s_3, cospi_8_64);
+  const int32x4_t s_3_cospi_24_64 = vmull_n_s16(s_3, cospi_24_64);
+
+  const int32x4_t temp3 = vmlal_n_s16(s_3_cospi_8_64, s_2, cospi_24_64);
+  const int32x4_t temp4 = vmlsl_n_s16(s_3_cospi_24_64, s_2, cospi_8_64);
+
+  // fdct_round_shift
+  int16x4_t out_1 = vrshrn_n_s32(temp3, DCT_CONST_BITS);
+  int16x4_t out_3 = vrshrn_n_s32(temp4, DCT_CONST_BITS);
+
+  transpose_s16_4x4d(&out_0, &out_1, &out_2, &out_3);
+
+  in[0] = out_0;
+  in[1] = out_1;
+  in[2] = out_2;
+  in[3] = out_3;
+}
+
+static INLINE void vpx_fdct8x8_pass1_notranspose_neon(int16x8_t *in,
+                                                      int16x8_t *out) {
+  const int16x8_t v_s0 = vaddq_s16(in[0], in[7]);
+  const int16x8_t v_s1 = vaddq_s16(in[1], in[6]);
+  const int16x8_t v_s2 = vaddq_s16(in[2], in[5]);
+  const int16x8_t v_s3 = vaddq_s16(in[3], in[4]);
+  const int16x8_t v_s4 = vsubq_s16(in[3], in[4]);
+  const int16x8_t v_s5 = vsubq_s16(in[2], in[5]);
+  const int16x8_t v_s6 = vsubq_s16(in[1], in[6]);
+  const int16x8_t v_s7 = vsubq_s16(in[0], in[7]);
+  // fdct4(step, step);
+  int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
+  int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
+  int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
+  int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
+  // fdct4(step, step);
+  int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+  int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+  int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
+  int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
+  int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64);
+  int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64);
+  int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64);
+  int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64);
+  v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64);
+  v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64);
+  v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64);
+  v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64);
+  v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
+  v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
+  v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
+  v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
+  {
+    const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+    const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+    const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+    const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+    const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+    const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+    const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+    const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+    out[0] = vcombine_s16(a, c);  // 00 01 02 03 40 41 42 43
+    out[2] = vcombine_s16(e, g);  // 20 21 22 23 60 61 62 63
+    out[4] = vcombine_s16(b, d);  // 04 05 06 07 44 45 46 47
+    out[6] = vcombine_s16(f, h);  // 24 25 26 27 64 65 66 67
+  }
+  // Stage 2
+  v_x0 = vsubq_s16(v_s6, v_s5);
+  v_x1 = vaddq_s16(v_s6, v_s5);
+  v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64);
+  v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64);
+  v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64);
+  v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64);
+  {
+    const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+    const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+    const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+    const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+    const int16x8_t ab = vcombine_s16(a, b);
+    const int16x8_t cd = vcombine_s16(c, d);
+    // Stage 3
+    v_x0 = vaddq_s16(v_s4, ab);
+    v_x1 = vsubq_s16(v_s4, ab);
+    v_x2 = vsubq_s16(v_s7, cd);
+    v_x3 = vaddq_s16(v_s7, cd);
+  }
+  // Stage 4
+  v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64);
+  v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64);
+  v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64);
+  v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64);
+  v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64);
+  v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64);
+  v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64);
+  v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64);
+  v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64);
+  v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64);
+  v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64);
+  v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64);
+  v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64);
+  v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64);
+  v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64);
+  v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64);
+  {
+    const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
+    const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
+    const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
+    const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
+    const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
+    const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
+    const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
+    const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
+    out[1] = vcombine_s16(a, c);  // 10 11 12 13 50 51 52 53
+    out[3] = vcombine_s16(e, g);  // 30 31 32 33 70 71 72 73
+    out[5] = vcombine_s16(b, d);  // 14 15 16 17 54 55 56 57
+    out[7] = vcombine_s16(f, h);  // 34 35 36 37 74 75 76 77
+  }
+}
+
+static INLINE void vpx_fdct8x8_pass1_neon(int16x8_t *in) {
+  int16x8_t out[8];
+  vpx_fdct8x8_pass1_notranspose_neon(in, out);
+  // transpose 8x8
+  // Can't use transpose_s16_8x8() because the values are arranged in two 4x8
+  // columns.
+  {
+    // 00 01 02 03 40 41 42 43
+    // 10 11 12 13 50 51 52 53
+    // 20 21 22 23 60 61 62 63
+    // 30 31 32 33 70 71 72 73
+    // 04 05 06 07 44 45 46 47
+    // 14 15 16 17 54 55 56 57
+    // 24 25 26 27 64 65 66 67
+    // 34 35 36 37 74 75 76 77
+    const int32x4x2_t r02_s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(out[0]), vreinterpretq_s32_s16(out[2]));
+    const int32x4x2_t r13_s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(out[1]), vreinterpretq_s32_s16(out[3]));
+    const int32x4x2_t r46_s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(out[4]), vreinterpretq_s32_s16(out[6]));
+    const int32x4x2_t r57_s32 =
+        vtrnq_s32(vreinterpretq_s32_s16(out[5]), vreinterpretq_s32_s16(out[7]));
+    const int16x8x2_t r01_s16 =
+        vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
+                  vreinterpretq_s16_s32(r13_s32.val[0]));
+    const int16x8x2_t r23_s16 =
+        vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
+                  vreinterpretq_s16_s32(r13_s32.val[1]));
+    const int16x8x2_t r45_s16 =
+        vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
+                  vreinterpretq_s16_s32(r57_s32.val[0]));
+    const int16x8x2_t r67_s16 =
+        vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
+                  vreinterpretq_s16_s32(r57_s32.val[1]));
+    in[0] = r01_s16.val[0];
+    in[1] = r01_s16.val[1];
+    in[2] = r23_s16.val[0];
+    in[3] = r23_s16.val[1];
+    in[4] = r45_s16.val[0];
+    in[5] = r45_s16.val[1];
+    in[6] = r67_s16.val[0];
+    in[7] = r67_s16.val[1];
+    // 00 10 20 30 40 50 60 70
+    // 01 11 21 31 41 51 61 71
+    // 02 12 22 32 42 52 62 72
+    // 03 13 23 33 43 53 63 73
+    // 04 14 24 34 44 54 64 74
+    // 05 15 25 35 45 55 65 75
+    // 06 16 26 36 46 56 66 76
+    // 07 17 27 37 47 57 67 77
+  }
+}
+#endif  // VPX_VPX_DSP_ARM_FDCT_NEON_H_
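
For reference, vpx_fdct4x4_pass1_neon() above computes four columns of the standard libvpx fdct4 at once. One column, rendered as scalar C (a sketch; the constant values mirror cospi_8_64, cospi_16_64 and cospi_24_64 from vpx_dsp/txfm_common.h):

#include <stdint.h>

static const int32_t kCospi8 = 15137;   // 16384 * cos(8 * pi / 64)
static const int32_t kCospi16 = 11585;  // 16384 * cos(16 * pi / 64)
static const int32_t kCospi24 = 6270;   // 16384 * cos(24 * pi / 64)

static int16_t fdct_round_shift_c(int32_t x) {
  return (int16_t)((x + (1 << 13)) >> 14);  // DCT_CONST_BITS == 14
}

// One column of the pass-1 butterflies.
static void fdct4_col(const int16_t in[4], int16_t out[4]) {
  const int32_t s0 = in[0] + in[3], s1 = in[1] + in[2];
  const int32_t s2 = in[1] - in[2], s3 = in[0] - in[3];
  out[0] = fdct_round_shift_c((s0 + s1) * kCospi16);
  out[2] = fdct_round_shift_c((s0 - s1) * kCospi16);
  out[1] = fdct_round_shift_c(s3 * kCospi8 + s2 * kCospi24);
  out[3] = fdct_round_shift_c(s3 * kCospi24 - s2 * kCospi8);
}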
diff --git a/libvpx/vpx_dsp/arm/fwd_txfm_neon.c b/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
index 374a262..d9161c6 100644
--- a/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
+++ b/libvpx/vpx_dsp/arm/fwd_txfm_neon.c
@@ -15,196 +15,54 @@
 #include "vpx_dsp/txfm_common.h"
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_dsp/arm/idct_neon.h"
+#include "vpx_dsp/arm/fdct_neon.h"
 #include "vpx_dsp/arm/mem_neon.h"
 
 void vpx_fdct8x8_neon(const int16_t *input, tran_low_t *final_output,
                       int stride) {
   int i;
   // stage 1
-  int16x8_t input_0 = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
-  int16x8_t input_1 = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
-  int16x8_t input_2 = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
-  int16x8_t input_3 = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
-  int16x8_t input_4 = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
-  int16x8_t input_5 = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
-  int16x8_t input_6 = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
-  int16x8_t input_7 = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
+  int16x8_t in[8];
+  in[0] = vshlq_n_s16(vld1q_s16(&input[0 * stride]), 2);
+  in[1] = vshlq_n_s16(vld1q_s16(&input[1 * stride]), 2);
+  in[2] = vshlq_n_s16(vld1q_s16(&input[2 * stride]), 2);
+  in[3] = vshlq_n_s16(vld1q_s16(&input[3 * stride]), 2);
+  in[4] = vshlq_n_s16(vld1q_s16(&input[4 * stride]), 2);
+  in[5] = vshlq_n_s16(vld1q_s16(&input[5 * stride]), 2);
+  in[6] = vshlq_n_s16(vld1q_s16(&input[6 * stride]), 2);
+  in[7] = vshlq_n_s16(vld1q_s16(&input[7 * stride]), 2);
   for (i = 0; i < 2; ++i) {
-    int16x8_t out_0, out_1, out_2, out_3, out_4, out_5, out_6, out_7;
-    const int16x8_t v_s0 = vaddq_s16(input_0, input_7);
-    const int16x8_t v_s1 = vaddq_s16(input_1, input_6);
-    const int16x8_t v_s2 = vaddq_s16(input_2, input_5);
-    const int16x8_t v_s3 = vaddq_s16(input_3, input_4);
-    const int16x8_t v_s4 = vsubq_s16(input_3, input_4);
-    const int16x8_t v_s5 = vsubq_s16(input_2, input_5);
-    const int16x8_t v_s6 = vsubq_s16(input_1, input_6);
-    const int16x8_t v_s7 = vsubq_s16(input_0, input_7);
-    // fdct4(step, step);
-    int16x8_t v_x0 = vaddq_s16(v_s0, v_s3);
-    int16x8_t v_x1 = vaddq_s16(v_s1, v_s2);
-    int16x8_t v_x2 = vsubq_s16(v_s1, v_s2);
-    int16x8_t v_x3 = vsubq_s16(v_s0, v_s3);
-    // fdct4(step, step);
-    int32x4_t v_t0_lo = vaddl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
-    int32x4_t v_t0_hi = vaddl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
-    int32x4_t v_t1_lo = vsubl_s16(vget_low_s16(v_x0), vget_low_s16(v_x1));
-    int32x4_t v_t1_hi = vsubl_s16(vget_high_s16(v_x0), vget_high_s16(v_x1));
-    int32x4_t v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_24_64);
-    int32x4_t v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_24_64);
-    int32x4_t v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_24_64);
-    int32x4_t v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_24_64);
-    v_t2_lo = vmlal_n_s16(v_t2_lo, vget_low_s16(v_x3), cospi_8_64);
-    v_t2_hi = vmlal_n_s16(v_t2_hi, vget_high_s16(v_x3), cospi_8_64);
-    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x2), cospi_8_64);
-    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x2), cospi_8_64);
-    v_t0_lo = vmulq_n_s32(v_t0_lo, cospi_16_64);
-    v_t0_hi = vmulq_n_s32(v_t0_hi, cospi_16_64);
-    v_t1_lo = vmulq_n_s32(v_t1_lo, cospi_16_64);
-    v_t1_hi = vmulq_n_s32(v_t1_hi, cospi_16_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
-      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
-      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
-      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
-      out_0 = vcombine_s16(a, c);  // 00 01 02 03 40 41 42 43
-      out_2 = vcombine_s16(e, g);  // 20 21 22 23 60 61 62 63
-      out_4 = vcombine_s16(b, d);  // 04 05 06 07 44 45 46 47
-      out_6 = vcombine_s16(f, h);  // 24 25 26 27 64 65 66 67
-    }
-    // Stage 2
-    v_x0 = vsubq_s16(v_s6, v_s5);
-    v_x1 = vaddq_s16(v_s6, v_s5);
-    v_t0_lo = vmull_n_s16(vget_low_s16(v_x0), cospi_16_64);
-    v_t0_hi = vmull_n_s16(vget_high_s16(v_x0), cospi_16_64);
-    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_16_64);
-    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_16_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x8_t ab = vcombine_s16(a, b);
-      const int16x8_t cd = vcombine_s16(c, d);
-      // Stage 3
-      v_x0 = vaddq_s16(v_s4, ab);
-      v_x1 = vsubq_s16(v_s4, ab);
-      v_x2 = vsubq_s16(v_s7, cd);
-      v_x3 = vaddq_s16(v_s7, cd);
-    }
-    // Stage 4
-    v_t0_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_4_64);
-    v_t0_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_4_64);
-    v_t0_lo = vmlal_n_s16(v_t0_lo, vget_low_s16(v_x0), cospi_28_64);
-    v_t0_hi = vmlal_n_s16(v_t0_hi, vget_high_s16(v_x0), cospi_28_64);
-    v_t1_lo = vmull_n_s16(vget_low_s16(v_x1), cospi_12_64);
-    v_t1_hi = vmull_n_s16(vget_high_s16(v_x1), cospi_12_64);
-    v_t1_lo = vmlal_n_s16(v_t1_lo, vget_low_s16(v_x2), cospi_20_64);
-    v_t1_hi = vmlal_n_s16(v_t1_hi, vget_high_s16(v_x2), cospi_20_64);
-    v_t2_lo = vmull_n_s16(vget_low_s16(v_x2), cospi_12_64);
-    v_t2_hi = vmull_n_s16(vget_high_s16(v_x2), cospi_12_64);
-    v_t2_lo = vmlsl_n_s16(v_t2_lo, vget_low_s16(v_x1), cospi_20_64);
-    v_t2_hi = vmlsl_n_s16(v_t2_hi, vget_high_s16(v_x1), cospi_20_64);
-    v_t3_lo = vmull_n_s16(vget_low_s16(v_x3), cospi_28_64);
-    v_t3_hi = vmull_n_s16(vget_high_s16(v_x3), cospi_28_64);
-    v_t3_lo = vmlsl_n_s16(v_t3_lo, vget_low_s16(v_x0), cospi_4_64);
-    v_t3_hi = vmlsl_n_s16(v_t3_hi, vget_high_s16(v_x0), cospi_4_64);
-    {
-      const int16x4_t a = vrshrn_n_s32(v_t0_lo, DCT_CONST_BITS);
-      const int16x4_t b = vrshrn_n_s32(v_t0_hi, DCT_CONST_BITS);
-      const int16x4_t c = vrshrn_n_s32(v_t1_lo, DCT_CONST_BITS);
-      const int16x4_t d = vrshrn_n_s32(v_t1_hi, DCT_CONST_BITS);
-      const int16x4_t e = vrshrn_n_s32(v_t2_lo, DCT_CONST_BITS);
-      const int16x4_t f = vrshrn_n_s32(v_t2_hi, DCT_CONST_BITS);
-      const int16x4_t g = vrshrn_n_s32(v_t3_lo, DCT_CONST_BITS);
-      const int16x4_t h = vrshrn_n_s32(v_t3_hi, DCT_CONST_BITS);
-      out_1 = vcombine_s16(a, c);  // 10 11 12 13 50 51 52 53
-      out_3 = vcombine_s16(e, g);  // 30 31 32 33 70 71 72 73
-      out_5 = vcombine_s16(b, d);  // 14 15 16 17 54 55 56 57
-      out_7 = vcombine_s16(f, h);  // 34 35 36 37 74 75 76 77
-    }
-    // transpose 8x8
-    // Can't use transpose_s16_8x8() because the values are arranged in two 4x8
-    // columns.
-    {
-      // 00 01 02 03 40 41 42 43
-      // 10 11 12 13 50 51 52 53
-      // 20 21 22 23 60 61 62 63
-      // 30 31 32 33 70 71 72 73
-      // 04 05 06 07 44 45 46 47
-      // 14 15 16 17 54 55 56 57
-      // 24 25 26 27 64 65 66 67
-      // 34 35 36 37 74 75 76 77
-      const int32x4x2_t r02_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_0), vreinterpretq_s32_s16(out_2));
-      const int32x4x2_t r13_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_1), vreinterpretq_s32_s16(out_3));
-      const int32x4x2_t r46_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_4), vreinterpretq_s32_s16(out_6));
-      const int32x4x2_t r57_s32 =
-          vtrnq_s32(vreinterpretq_s32_s16(out_5), vreinterpretq_s32_s16(out_7));
-      const int16x8x2_t r01_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[0]),
-                    vreinterpretq_s16_s32(r13_s32.val[0]));
-      const int16x8x2_t r23_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r02_s32.val[1]),
-                    vreinterpretq_s16_s32(r13_s32.val[1]));
-      const int16x8x2_t r45_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[0]),
-                    vreinterpretq_s16_s32(r57_s32.val[0]));
-      const int16x8x2_t r67_s16 =
-          vtrnq_s16(vreinterpretq_s16_s32(r46_s32.val[1]),
-                    vreinterpretq_s16_s32(r57_s32.val[1]));
-      input_0 = r01_s16.val[0];
-      input_1 = r01_s16.val[1];
-      input_2 = r23_s16.val[0];
-      input_3 = r23_s16.val[1];
-      input_4 = r45_s16.val[0];
-      input_5 = r45_s16.val[1];
-      input_6 = r67_s16.val[0];
-      input_7 = r67_s16.val[1];
-      // 00 10 20 30 40 50 60 70
-      // 01 11 21 31 41 51 61 71
-      // 02 12 22 32 42 52 62 72
-      // 03 13 23 33 43 53 63 73
-      // 04 14 24 34 44 54 64 74
-      // 05 15 25 35 45 55 65 75
-      // 06 16 26 36 46 56 66 76
-      // 07 17 27 37 47 57 67 77
-    }
+    vpx_fdct8x8_pass1_neon(in);
   }  // for
   {
     // from vpx_dct_sse2.c
     // Post-condition (division by two)
     //    division by two of 16-bit signed numbers using shifts
     //    n / 2 = (n - (n >> 15)) >> 1
-    const int16x8_t sign_in0 = vshrq_n_s16(input_0, 15);
-    const int16x8_t sign_in1 = vshrq_n_s16(input_1, 15);
-    const int16x8_t sign_in2 = vshrq_n_s16(input_2, 15);
-    const int16x8_t sign_in3 = vshrq_n_s16(input_3, 15);
-    const int16x8_t sign_in4 = vshrq_n_s16(input_4, 15);
-    const int16x8_t sign_in5 = vshrq_n_s16(input_5, 15);
-    const int16x8_t sign_in6 = vshrq_n_s16(input_6, 15);
-    const int16x8_t sign_in7 = vshrq_n_s16(input_7, 15);
-    input_0 = vhsubq_s16(input_0, sign_in0);
-    input_1 = vhsubq_s16(input_1, sign_in1);
-    input_2 = vhsubq_s16(input_2, sign_in2);
-    input_3 = vhsubq_s16(input_3, sign_in3);
-    input_4 = vhsubq_s16(input_4, sign_in4);
-    input_5 = vhsubq_s16(input_5, sign_in5);
-    input_6 = vhsubq_s16(input_6, sign_in6);
-    input_7 = vhsubq_s16(input_7, sign_in7);
+    const int16x8_t sign_in0 = vshrq_n_s16(in[0], 15);
+    const int16x8_t sign_in1 = vshrq_n_s16(in[1], 15);
+    const int16x8_t sign_in2 = vshrq_n_s16(in[2], 15);
+    const int16x8_t sign_in3 = vshrq_n_s16(in[3], 15);
+    const int16x8_t sign_in4 = vshrq_n_s16(in[4], 15);
+    const int16x8_t sign_in5 = vshrq_n_s16(in[5], 15);
+    const int16x8_t sign_in6 = vshrq_n_s16(in[6], 15);
+    const int16x8_t sign_in7 = vshrq_n_s16(in[7], 15);
+    in[0] = vhsubq_s16(in[0], sign_in0);
+    in[1] = vhsubq_s16(in[1], sign_in1);
+    in[2] = vhsubq_s16(in[2], sign_in2);
+    in[3] = vhsubq_s16(in[3], sign_in3);
+    in[4] = vhsubq_s16(in[4], sign_in4);
+    in[5] = vhsubq_s16(in[5], sign_in5);
+    in[6] = vhsubq_s16(in[6], sign_in6);
+    in[7] = vhsubq_s16(in[7], sign_in7);
     // store results
-    store_s16q_to_tran_low(final_output + 0 * 8, input_0);
-    store_s16q_to_tran_low(final_output + 1 * 8, input_1);
-    store_s16q_to_tran_low(final_output + 2 * 8, input_2);
-    store_s16q_to_tran_low(final_output + 3 * 8, input_3);
-    store_s16q_to_tran_low(final_output + 4 * 8, input_4);
-    store_s16q_to_tran_low(final_output + 5 * 8, input_5);
-    store_s16q_to_tran_low(final_output + 6 * 8, input_6);
-    store_s16q_to_tran_low(final_output + 7 * 8, input_7);
+    store_s16q_to_tran_low(final_output + 0 * 8, in[0]);
+    store_s16q_to_tran_low(final_output + 1 * 8, in[1]);
+    store_s16q_to_tran_low(final_output + 2 * 8, in[2]);
+    store_s16q_to_tran_low(final_output + 3 * 8, in[3]);
+    store_s16q_to_tran_low(final_output + 4 * 8, in[4]);
+    store_s16q_to_tran_low(final_output + 5 * 8, in[5]);
+    store_s16q_to_tran_low(final_output + 6 * 8, in[6]);
+    store_s16q_to_tran_low(final_output + 7 * 8, in[7]);
   }
 }
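
Editor's note: the post-condition above depends on the identity n / 2 = (n - (n >> 15)) >> 1 for 16-bit signed n. The arithmetic shift yields -1 for negative inputs and 0 otherwise, so the subtraction adds 1 before halving and the flooring shift becomes truncation toward zero, matching C integer division; vhsubq_s16 computes the (a - b) >> 1 step in one instruction. A minimal scalar check of the identity (illustrative only, not part of the patch):

#include <assert.h>
#include <stdint.h>

int main(void) {
  int32_t n;
  for (n = INT16_MIN; n <= INT16_MAX; ++n) {
    const int16_t x = (int16_t)n;
    const int16_t sign = (int16_t)(x >> 15); /* -1 if x < 0, else 0 */
    /* Matches C's truncating division for every 16-bit value. */
    assert((int16_t)((x - sign) >> 1) == (int16_t)(x / 2));
  }
  return 0;
}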
diff --git a/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm b/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm
index 184d218..175ba7f 100644
--- a/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm
+++ b/libvpx/vpx_dsp/arm/idct4x4_add_neon.asm
@@ -17,7 +17,7 @@
 
     INCLUDE vpx_dsp/arm/idct_neon.asm.S
 
-    AREA     Block, CODE, READONLY ; name this block of code
+    AREA     Block, CODE, READONLY
 ;void vpx_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int stride)
 ;
 ; r0  int16_t input
diff --git a/libvpx/vpx_dsp/arm/quantize_neon.c b/libvpx/vpx_dsp/arm/quantize_neon.c
index adef5f6..bd7818a 100644
--- a/libvpx/vpx_dsp/arm/quantize_neon.c
+++ b/libvpx/vpx_dsp/arm/quantize_neon.c
@@ -32,8 +32,8 @@
 }
 
 void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                         int skip_block, const int16_t *zbin_ptr,
-                         const int16_t *round_ptr, const int16_t *quant_ptr,
+                         const int16_t *zbin_ptr, const int16_t *round_ptr,
+                         const int16_t *quant_ptr,
                          const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                          uint16_t *eob_ptr, const int16_t *scan,
@@ -42,8 +42,6 @@
   const int16x8_t neg_one = vdupq_n_s16(-1);
   uint16x8_t eob_max;
   (void)scan;
-  (void)skip_block;
-  assert(!skip_block);
 
   // Process first 8 values which include a dc component.
   {
@@ -189,7 +187,7 @@
 // Main difference is that zbin values are halved before comparison and dqcoeff
 // values are divided by 2. zbin is rounded but dqcoeff is not.
 void vpx_quantize_b_32x32_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                               int skip_block, const int16_t *zbin_ptr,
+                               const int16_t *zbin_ptr,
                                const int16_t *round_ptr,
                                const int16_t *quant_ptr,
                                const int16_t *quant_shift_ptr,
@@ -202,8 +200,6 @@
   int i;
   (void)scan;
   (void)n_coeffs;  // Because we will always calculate 32*32.
-  (void)skip_block;
-  assert(!skip_block);
 
   // Process first 8 values which include a dc component.
   {
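
Editor's note: the removed skip_block argument was dead weight, as both kernels asserted it was zero, so dropping it changes the signature but not the behavior. For orientation, a scalar sketch of the per-coefficient pipeline these functions vectorize, following the fixed-point formulation of the generic vpx_quantize_b_c (abbreviated names; an illustration, not the exact upstream code). Per the comment above, the 32x32 variant additionally halves zbin before the comparison and halves the dequantized value.

#include <stdint.h>

/* Dead-zone check against zbin, two-stage fixed-point multiply, sign
 * restoration, then dequantization. */
static int16_t quantize_one(int16_t coeff, int16_t zbin, int16_t round,
                            int16_t quant, int16_t quant_shift,
                            int16_t dequant, int16_t *dqcoeff) {
  const int sign = coeff >> 15; /* -1 or 0 */
  const int abs_coeff = (coeff ^ sign) - sign;
  int qcoeff = 0;
  if (abs_coeff >= zbin) {
    int tmp = abs_coeff + round;
    if (tmp > INT16_MAX) tmp = INT16_MAX; /* upstream clamps here too */
    tmp = ((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16;
    qcoeff = (tmp ^ sign) - sign;
  }
  *dqcoeff = (int16_t)(qcoeff * dequant);
  return (int16_t)qcoeff;
}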
diff --git a/libvpx/vpx_dsp/arm/sad4d_neon.c b/libvpx/vpx_dsp/arm/sad4d_neon.c
index 5c7a0fc..03f716c 100644
--- a/libvpx/vpx_dsp/arm/sad4d_neon.c
+++ b/libvpx/vpx_dsp/arm/sad4d_neon.c
@@ -31,7 +31,7 @@
 static INLINE void sad4x_4d(const uint8_t *const src_ptr, const int src_stride,
                             const uint8_t *const ref_array[4],
                             const int ref_stride, const int height,
-                            uint32_t *const res) {
+                            uint32_t sad_array[4]) {
   int i;
   uint16x8_t abs[2] = { vdupq_n_u16(0), vdupq_n_u16(0) };
 #if !defined(__aarch64__)
@@ -61,26 +61,26 @@
   a[1] = vpadd_u16(vget_low_u16(abs[1]), vget_high_u16(abs[1]));
   r = vpaddlq_u16(vcombine_u16(a[0], a[1]));
 #endif
-  vst1q_u32(res, r);
+  vst1q_u32(sad_array, r);
 }
 
 void vpx_sad4x4x4d_neon(const uint8_t *src_ptr, int src_stride,
                         const uint8_t *const ref_array[4], int ref_stride,
-                        uint32_t *res) {
-  sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, res);
+                        uint32_t sad_array[4]) {
+  sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 4, sad_array);
 }
 
 void vpx_sad4x8x4d_neon(const uint8_t *src_ptr, int src_stride,
                         const uint8_t *const ref_array[4], int ref_stride,
-                        uint32_t *res) {
-  sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, res);
+                        uint32_t sad_array[4]) {
+  sad4x_4d(src_ptr, src_stride, ref_array, ref_stride, 8, sad_array);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
 
 // Can handle 512 pixels' sad sum (such as 16x32 or 32x16)
-static INLINE void sad_512_pel_final_neon(const uint16x8_t *sum /*[4]*/,
-                                          uint32_t *const res) {
+static INLINE void sad_512_pel_final_neon(const uint16x8_t sum[4],
+                                          uint32_t sad_array[4]) {
 #if defined(__aarch64__)
   const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
   const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
@@ -95,21 +95,21 @@
   const uint16x4_t b1 = vpadd_u16(a2, a3);
   const uint32x4_t r = vpaddlq_u16(vcombine_u16(b0, b1));
 #endif
-  vst1q_u32(res, r);
+  vst1q_u32(sad_array, r);
 }
 
 #if defined(__arm__) || !defined(__ARM_FEATURE_DOTPROD)
 
 // Can handle 1024 pixels' sad sum (such as 32x32)
-static INLINE void sad_1024_pel_final_neon(const uint16x8_t *sum /*[4]*/,
-                                           uint32_t *const res) {
+static INLINE void sad_1024_pel_final_neon(const uint16x8_t sum[4],
+                                           uint32_t sad_array[4]) {
 #if defined(__aarch64__)
   const uint16x8_t a0 = vpaddq_u16(sum[0], sum[1]);
   const uint16x8_t a1 = vpaddq_u16(sum[2], sum[3]);
   const uint32x4_t b0 = vpaddlq_u16(a0);
   const uint32x4_t b1 = vpaddlq_u16(a1);
   const uint32x4_t r = vpaddq_u32(b0, b1);
-  vst1q_u32(res, r);
+  vst1q_u32(sad_array, r);
 #else
   const uint16x4_t a0 = vpadd_u16(vget_low_u16(sum[0]), vget_high_u16(sum[0]));
   const uint16x4_t a1 = vpadd_u16(vget_low_u16(sum[1]), vget_high_u16(sum[1]));
@@ -119,13 +119,13 @@
   const uint32x4_t b1 = vpaddlq_u16(vcombine_u16(a2, a3));
   const uint32x2_t c0 = vpadd_u32(vget_low_u32(b0), vget_high_u32(b0));
   const uint32x2_t c1 = vpadd_u32(vget_low_u32(b1), vget_high_u32(b1));
-  vst1q_u32(res, vcombine_u32(c0, c1));
+  vst1q_u32(sad_array, vcombine_u32(c0, c1));
 #endif
 }
 
 // Can handle 2048 pixels' sad sum (such as 32x64 or 64x32)
-static INLINE void sad_2048_pel_final_neon(const uint16x8_t *sum /*[4]*/,
-                                           uint32_t *const res) {
+static INLINE void sad_2048_pel_final_neon(const uint16x8_t sum[4],
+                                           uint32_t sad_array[4]) {
 #if defined(__aarch64__)
   const uint32x4_t a0 = vpaddlq_u16(sum[0]);
   const uint32x4_t a1 = vpaddlq_u16(sum[1]);
@@ -134,7 +134,7 @@
   const uint32x4_t b0 = vpaddq_u32(a0, a1);
   const uint32x4_t b1 = vpaddq_u32(a2, a3);
   const uint32x4_t r = vpaddq_u32(b0, b1);
-  vst1q_u32(res, r);
+  vst1q_u32(sad_array, r);
 #else
   const uint32x4_t a0 = vpaddlq_u16(sum[0]);
   const uint32x4_t a1 = vpaddlq_u16(sum[1]);
@@ -146,13 +146,13 @@
   const uint32x2_t b3 = vadd_u32(vget_low_u32(a3), vget_high_u32(a3));
   const uint32x2_t c0 = vpadd_u32(b0, b1);
   const uint32x2_t c1 = vpadd_u32(b2, b3);
-  vst1q_u32(res, vcombine_u32(c0, c1));
+  vst1q_u32(sad_array, vcombine_u32(c0, c1));
 #endif
 }
 
 // Can handle 4096 pixels' sad sum (such as 64x64)
-static INLINE void sad_4096_pel_final_neon(const uint16x8_t *sum /*[8]*/,
-                                           uint32_t *const res) {
+static INLINE void sad_4096_pel_final_neon(const uint16x8_t sum[8],
+                                           uint32_t sad_array[4]) {
 #if defined(__aarch64__)
   const uint32x4_t a0 = vpaddlq_u16(sum[0]);
   const uint32x4_t a1 = vpaddlq_u16(sum[1]);
@@ -169,7 +169,7 @@
   const uint32x4_t c0 = vpaddq_u32(b0, b1);
   const uint32x4_t c1 = vpaddq_u32(b2, b3);
   const uint32x4_t r = vpaddq_u32(c0, c1);
-  vst1q_u32(res, r);
+  vst1q_u32(sad_array, r);
 #else
   const uint32x4_t a0 = vpaddlq_u16(sum[0]);
   const uint32x4_t a1 = vpaddlq_u16(sum[1]);
@@ -189,7 +189,7 @@
   const uint32x2_t c3 = vadd_u32(vget_low_u32(b3), vget_high_u32(b3));
   const uint32x2_t d0 = vpadd_u32(c0, c1);
   const uint32x2_t d1 = vpadd_u32(c2, c3);
-  vst1q_u32(res, vcombine_u32(d0, d1));
+  vst1q_u32(sad_array, vcombine_u32(d0, d1));
 #endif
 }
 
@@ -197,7 +197,7 @@
 
 static INLINE void sad8x_4d(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *const ref_array[4], int ref_stride,
-                            uint32_t *res, const int height) {
+                            uint32_t sad_array[4], const int height) {
   int i, j;
   const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
                                  ref_array[3] };
@@ -214,25 +214,25 @@
     }
   }
 
-  sad_512_pel_final_neon(sum, res);
+  sad_512_pel_final_neon(sum, sad_array);
 }
 
 void vpx_sad8x4x4d_neon(const uint8_t *src_ptr, int src_stride,
                         const uint8_t *const ref_array[4], int ref_stride,
-                        uint32_t *res) {
-  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 4);
+                        uint32_t sad_array[4]) {
+  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 4);
 }
 
 void vpx_sad8x8x4d_neon(const uint8_t *src_ptr, int src_stride,
                         const uint8_t *const ref_array[4], int ref_stride,
-                        uint32_t *res) {
-  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8);
+                        uint32_t sad_array[4]) {
+  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8);
 }
 
 void vpx_sad8x16x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
-                         uint32_t *res) {
-  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16);
+                         uint32_t sad_array[4]) {
+  sad8x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -249,7 +249,7 @@
 
 static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *const ref_array[4], int ref_stride,
-                             uint32_t *res, const int height) {
+                             uint32_t sad_array[4], const int height) {
   int i;
   uint32x4_t r0, r1;
   const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
@@ -267,7 +267,7 @@
 
   r0 = vpaddq_u32(sum[0], sum[1]);
   r1 = vpaddq_u32(sum[2], sum[3]);
-  vst1q_u32(res, vpaddq_u32(r0, r1));
+  vst1q_u32(sad_array, vpaddq_u32(r0, r1));
 }
 
 #else
@@ -281,7 +281,7 @@
 
 static INLINE void sad16x_4d(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *const ref_array[4], int ref_stride,
-                             uint32_t *res, const int height) {
+                             uint32_t sad_array[4], const int height) {
   int i;
   const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
                                  ref_array[3] };
@@ -302,27 +302,27 @@
     ref_loop[3] += ref_stride;
   }
 
-  sad_512_pel_final_neon(sum, res);
+  sad_512_pel_final_neon(sum, sad_array);
 }
 
 #endif
 
 void vpx_sad16x8x4d_neon(const uint8_t *src_ptr, int src_stride,
                          const uint8_t *const ref_array[4], int ref_stride,
-                         uint32_t *res) {
-  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 8);
+                         uint32_t sad_array[4]) {
+  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 8);
 }
 
 void vpx_sad16x16x4d_neon(const uint8_t *src_ptr, int src_stride,
                           const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t *res) {
-  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16);
+                          uint32_t sad_array[4]) {
+  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16);
 }
 
 void vpx_sad16x32x4d_neon(const uint8_t *src_ptr, int src_stride,
                           const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t *res) {
-  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 32);
+                          uint32_t sad_array[4]) {
+  sad16x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32);
 }
 
 ////////////////////////////////////////////////////////////////////////////////
@@ -332,7 +332,7 @@
 
 static INLINE void sad32x_4d(const uint8_t *src_ptr, int src_stride,
                              const uint8_t *const ref_array[4], int ref_stride,
-                             uint32_t *res, const int height) {
+                             uint32_t sad_array[4], const int height) {
   int i;
   uint32x4_t r0, r1;
   const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
@@ -365,25 +365,25 @@
 
   r0 = vpaddq_u32(sum[0], sum[1]);
   r1 = vpaddq_u32(sum[2], sum[3]);
-  vst1q_u32(res, vpaddq_u32(r0, r1));
+  vst1q_u32(sad_array, vpaddq_u32(r0, r1));
 }
 
 void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride,
                           const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t *res) {
-  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 16);
+                          uint32_t sad_array[4]) {
+  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 16);
 }
 
 void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride,
                           const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t *res) {
-  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 32);
+                          uint32_t sad_array[4]) {
+  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 32);
 }
 
 void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
                           const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t *res) {
-  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, res, 64);
+                          uint32_t sad_array[4]) {
+  sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, sad_array, 64);
 }
 
 #else
@@ -422,26 +422,26 @@
 
 void vpx_sad32x16x4d_neon(const uint8_t *src_ptr, int src_stride,
                           const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t *res) {
+                          uint32_t sad_array[4]) {
   uint16x8_t sum[4];
   sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 16, sum);
-  sad_512_pel_final_neon(sum, res);
+  sad_512_pel_final_neon(sum, sad_array);
 }
 
 void vpx_sad32x32x4d_neon(const uint8_t *src_ptr, int src_stride,
                           const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t *res) {
+                          uint32_t sad_array[4]) {
   uint16x8_t sum[4];
   sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 32, sum);
-  sad_1024_pel_final_neon(sum, res);
+  sad_1024_pel_final_neon(sum, sad_array);
 }
 
 void vpx_sad32x64x4d_neon(const uint8_t *src_ptr, int src_stride,
                           const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t *res) {
+                          uint32_t sad_array[4]) {
   uint16x8_t sum[4];
   sad32x_4d(src_ptr, src_stride, ref_array, ref_stride, 64, sum);
-  sad_2048_pel_final_neon(sum, res);
+  sad_2048_pel_final_neon(sum, sad_array);
 }
 
 #endif
@@ -453,7 +453,7 @@
 
 void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
                           const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t *res) {
+                          uint32_t sad_array[4]) {
   int i;
   uint32x4_t r0, r1;
   const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
@@ -497,12 +497,12 @@
 
   r0 = vpaddq_u32(sum[0], sum[1]);
   r1 = vpaddq_u32(sum[2], sum[3]);
-  vst1q_u32(res, vpaddq_u32(r0, r1));
+  vst1q_u32(sad_array, vpaddq_u32(r0, r1));
 }
 
 void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
                           const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t *res) {
+                          uint32_t sad_array[4]) {
   int i;
   uint32x4_t r0, r1, r2, r3;
   const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
@@ -551,14 +551,14 @@
   r3 = vpaddq_u32(sum[6], sum[7]);
   r0 = vpaddq_u32(r0, r1);
   r1 = vpaddq_u32(r2, r3);
-  vst1q_u32(res, vpaddq_u32(r0, r1));
+  vst1q_u32(sad_array, vpaddq_u32(r0, r1));
 }
 
 #else
 
 void vpx_sad64x32x4d_neon(const uint8_t *src_ptr, int src_stride,
                           const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t *res) {
+                          uint32_t sad_array[4]) {
   int i;
   const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
                                  ref_array[3] };
@@ -599,12 +599,12 @@
     ref_loop[3] += ref_stride;
   }
 
-  sad_2048_pel_final_neon(sum, res);
+  sad_2048_pel_final_neon(sum, sad_array);
 }
 
 void vpx_sad64x64x4d_neon(const uint8_t *src_ptr, int src_stride,
                           const uint8_t *const ref_array[4], int ref_stride,
-                          uint32_t *res) {
+                          uint32_t sad_array[4]) {
   int i;
   const uint8_t *ref_loop[4] = { ref_array[0], ref_array[1], ref_array[2],
                                  ref_array[3] };
@@ -646,7 +646,7 @@
     ref_loop[3] += ref_stride;
   }
 
-  sad_4096_pel_final_neon(sum, res);
+  sad_4096_pel_final_neon(sum, sad_array);
 }
 
 #endif
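
Editor's note: renaming res to sad_array[4] throughout is documentary, since an array parameter still decays to a pointer in C, but the declared extent records that every x4d kernel writes exactly four sums: the SAD of one source block against each of four candidate reference blocks. A scalar statement of that contract (hypothetical helper):

#include <stdint.h>

static void sad_4d_ref(const uint8_t *src, int src_stride,
                       const uint8_t *const ref[4], int ref_stride,
                       int width, int height, uint32_t sad_array[4]) {
  int k, i, j;
  for (k = 0; k < 4; ++k) {
    uint32_t sad = 0;
    for (i = 0; i < height; ++i) {
      for (j = 0; j < width; ++j) {
        const int d = src[i * src_stride + j] - ref[k][i * ref_stride + j];
        sad += (uint32_t)(d < 0 ? -d : d);
      }
    }
    sad_array[k] = sad;
  }
}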
diff --git a/libvpx/vpx_dsp/arm/sad_neon.c b/libvpx/vpx_dsp/arm/sad_neon.c
index 59567bd..b1509d8 100644
--- a/libvpx/vpx_dsp/arm/sad_neon.c
+++ b/libvpx/vpx_dsp/arm/sad_neon.c
@@ -110,7 +110,7 @@
   return abs;
 }
 
-#define sad8xN(n)                                                              \
+#define SAD8XN(n)                                                              \
   uint32_t vpx_sad8x##n##_neon(const uint8_t *src_ptr, int src_stride,         \
                                const uint8_t *ref_ptr, int ref_stride) {       \
     const uint16x8_t abs = sad8x(src_ptr, src_stride, ref_ptr, ref_stride, n); \
@@ -125,9 +125,9 @@
     return horizontal_add_uint16x8(abs);                                       \
   }
 
-sad8xN(4);
-sad8xN(8);
-sad8xN(16);
+SAD8XN(4)
+SAD8XN(8)
+SAD8XN(16)
 
 static INLINE uint16x8_t sad16x(const uint8_t *src_ptr, int src_stride,
                                 const uint8_t *ref_ptr, int ref_stride,
@@ -167,7 +167,7 @@
   return abs;
 }
 
-#define sad16xN(n)                                                            \
+#define SAD16XN(n)                                                            \
   uint32_t vpx_sad16x##n##_neon(const uint8_t *src_ptr, int src_stride,       \
                                 const uint8_t *ref_ptr, int ref_stride) {     \
     const uint16x8_t abs =                                                    \
@@ -183,9 +183,9 @@
     return horizontal_add_uint16x8(abs);                                      \
   }
 
-sad16xN(8);
-sad16xN(16);
-sad16xN(32);
+SAD16XN(8)
+SAD16XN(16)
+SAD16XN(32)
 
 static INLINE uint16x8_t sad32x(const uint8_t *src_ptr, int src_stride,
                                 const uint8_t *ref_ptr, int ref_stride,
@@ -235,7 +235,7 @@
   return abs;
 }
 
-#define sad32xN(n)                                                            \
+#define SAD32XN(n)                                                            \
   uint32_t vpx_sad32x##n##_neon(const uint8_t *src_ptr, int src_stride,       \
                                 const uint8_t *ref_ptr, int ref_stride) {     \
     const uint16x8_t abs =                                                    \
@@ -251,9 +251,9 @@
     return horizontal_add_uint16x8(abs);                                      \
   }
 
-sad32xN(16);
-sad32xN(32);
-sad32xN(64);
+SAD32XN(16)
+SAD32XN(32)
+SAD32XN(64)
 
 static INLINE uint32x4_t sad64x(const uint8_t *src_ptr, int src_stride,
                                 const uint8_t *ref_ptr, int ref_stride,
@@ -333,7 +333,7 @@
   }
 }
 
-#define sad64xN(n)                                                            \
+#define SAD64XN(n)                                                            \
   uint32_t vpx_sad64x##n##_neon(const uint8_t *src_ptr, int src_stride,       \
                                 const uint8_t *ref_ptr, int ref_stride) {     \
     const uint32x4_t abs =                                                    \
@@ -349,5 +349,5 @@
     return horizontal_add_uint32x4(abs);                                      \
   }
 
-sad64xN(32);
-sad64xN(64);
+SAD64XN(32)
+SAD64XN(64)
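
Editor's note: upper-casing sad8xN and its siblings follows the convention that function-generating macros look like macros, and the dropped trailing semicolons are a real fix: each expansion is already a complete function definition, so a following ';' leaves an empty declaration at file scope, which pedantic builds warn about. A minimal sketch of the pattern with hypothetical names:

#include <stdint.h>

/* The invocation must not end in ';': the body already supplies a full
 * definition, and a stray file-scope ';' draws a -pedantic warning. */
#define DEFINE_SCALE(n)                      \
  static uint32_t scale_by_##n(uint32_t x) { \
    return x * (uint32_t)(n);                \
  }

DEFINE_SCALE(2)
DEFINE_SCALE(4)

uint32_t scale_demo(uint32_t x) { return scale_by_2(x) + scale_by_4(x); }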
diff --git a/libvpx/vpx_dsp/arm/subpel_variance_neon.c b/libvpx/vpx_dsp/arm/subpel_variance_neon.c
index 37bfd1c..a3befdc 100644
--- a/libvpx/vpx_dsp/arm/subpel_variance_neon.c
+++ b/libvpx/vpx_dsp/arm/subpel_variance_neon.c
@@ -97,7 +97,7 @@
 
 // 4xM filter writes an extra row to fdata because it processes two rows at a
 // time.
-#define sub_pixel_varianceNxM(n, m)                                         \
+#define SUB_PIXEL_VARIANCENXM(n, m)                                         \
   uint32_t vpx_sub_pixel_variance##n##x##m##_neon(                          \
       const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,   \
       const uint8_t *ref_ptr, int ref_stride, uint32_t *sse) {              \
@@ -123,23 +123,23 @@
     return vpx_variance##n##x##m(temp1, n, ref_ptr, ref_stride, sse);       \
   }
 
-sub_pixel_varianceNxM(4, 4);
-sub_pixel_varianceNxM(4, 8);
-sub_pixel_varianceNxM(8, 4);
-sub_pixel_varianceNxM(8, 8);
-sub_pixel_varianceNxM(8, 16);
-sub_pixel_varianceNxM(16, 8);
-sub_pixel_varianceNxM(16, 16);
-sub_pixel_varianceNxM(16, 32);
-sub_pixel_varianceNxM(32, 16);
-sub_pixel_varianceNxM(32, 32);
-sub_pixel_varianceNxM(32, 64);
-sub_pixel_varianceNxM(64, 32);
-sub_pixel_varianceNxM(64, 64);
+SUB_PIXEL_VARIANCENXM(4, 4)
+SUB_PIXEL_VARIANCENXM(4, 8)
+SUB_PIXEL_VARIANCENXM(8, 4)
+SUB_PIXEL_VARIANCENXM(8, 8)
+SUB_PIXEL_VARIANCENXM(8, 16)
+SUB_PIXEL_VARIANCENXM(16, 8)
+SUB_PIXEL_VARIANCENXM(16, 16)
+SUB_PIXEL_VARIANCENXM(16, 32)
+SUB_PIXEL_VARIANCENXM(32, 16)
+SUB_PIXEL_VARIANCENXM(32, 32)
+SUB_PIXEL_VARIANCENXM(32, 64)
+SUB_PIXEL_VARIANCENXM(64, 32)
+SUB_PIXEL_VARIANCENXM(64, 64)
 
 // 4xM filter writes an extra row to fdata because it processes two rows at a
 // time.
-#define sub_pixel_avg_varianceNxM(n, m)                                     \
+#define SUB_PIXEL_AVG_VARIANCENXM(n, m)                                     \
   uint32_t vpx_sub_pixel_avg_variance##n##x##m##_neon(                      \
       const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset,   \
       const uint8_t *ref_ptr, int ref_stride, uint32_t *sse,                \
@@ -169,16 +169,16 @@
     return vpx_variance##n##x##m(temp0, n, ref_ptr, ref_stride, sse);       \
   }
 
-sub_pixel_avg_varianceNxM(4, 4);
-sub_pixel_avg_varianceNxM(4, 8);
-sub_pixel_avg_varianceNxM(8, 4);
-sub_pixel_avg_varianceNxM(8, 8);
-sub_pixel_avg_varianceNxM(8, 16);
-sub_pixel_avg_varianceNxM(16, 8);
-sub_pixel_avg_varianceNxM(16, 16);
-sub_pixel_avg_varianceNxM(16, 32);
-sub_pixel_avg_varianceNxM(32, 16);
-sub_pixel_avg_varianceNxM(32, 32);
-sub_pixel_avg_varianceNxM(32, 64);
-sub_pixel_avg_varianceNxM(64, 32);
-sub_pixel_avg_varianceNxM(64, 64);
+SUB_PIXEL_AVG_VARIANCENXM(4, 4)
+SUB_PIXEL_AVG_VARIANCENXM(4, 8)
+SUB_PIXEL_AVG_VARIANCENXM(8, 4)
+SUB_PIXEL_AVG_VARIANCENXM(8, 8)
+SUB_PIXEL_AVG_VARIANCENXM(8, 16)
+SUB_PIXEL_AVG_VARIANCENXM(16, 8)
+SUB_PIXEL_AVG_VARIANCENXM(16, 16)
+SUB_PIXEL_AVG_VARIANCENXM(16, 32)
+SUB_PIXEL_AVG_VARIANCENXM(32, 16)
+SUB_PIXEL_AVG_VARIANCENXM(32, 32)
+SUB_PIXEL_AVG_VARIANCENXM(32, 64)
+SUB_PIXEL_AVG_VARIANCENXM(64, 32)
+SUB_PIXEL_AVG_VARIANCENXM(64, 64)
diff --git a/libvpx/vpx_dsp/arm/transpose_neon.h b/libvpx/vpx_dsp/arm/transpose_neon.h
index 7523081..c098ad3 100644
--- a/libvpx/vpx_dsp/arm/transpose_neon.h
+++ b/libvpx/vpx_dsp/arm/transpose_neon.h
@@ -1184,6 +1184,45 @@
   *o15 = e7.val[1];
 }
 
+static INLINE void transpose_s16_16x16(int16x8_t *in0, int16x8_t *in1) {
+  int16x8_t t[8];
+
+  // transpose the 4 8x8 quadrants separately but first swap quadrants 2 and 3.
+  t[0] = in0[8];
+  t[1] = in0[9];
+  t[2] = in0[10];
+  t[3] = in0[11];
+  t[4] = in0[12];
+  t[5] = in0[13];
+  t[6] = in0[14];
+  t[7] = in0[15];
+  in0[8] = in1[0];
+  in0[9] = in1[1];
+  in0[10] = in1[2];
+  in0[11] = in1[3];
+  in0[12] = in1[4];
+  in0[13] = in1[5];
+  in0[14] = in1[6];
+  in0[15] = in1[7];
+  in1[0] = t[0];
+  in1[1] = t[1];
+  in1[2] = t[2];
+  in1[3] = t[3];
+  in1[4] = t[4];
+  in1[5] = t[5];
+  in1[6] = t[6];
+  in1[7] = t[7];
+
+  transpose_s16_8x8(&in0[0], &in0[1], &in0[2], &in0[3], &in0[4], &in0[5],
+                    &in0[6], &in0[7]);
+  transpose_s16_8x8(&in0[8], &in0[9], &in0[10], &in0[11], &in0[12], &in0[13],
+                    &in0[14], &in0[15]);
+  transpose_s16_8x8(&in1[0], &in1[1], &in1[2], &in1[3], &in1[4], &in1[5],
+                    &in1[6], &in1[7]);
+  transpose_s16_8x8(&in1[8], &in1[9], &in1[10], &in1[11], &in1[12], &in1[13],
+                    &in1[14], &in1[15]);
+}
+
 static INLINE void load_and_transpose_u8_4x8(const uint8_t *a,
                                              const int a_stride, uint8x8_t *a0,
                                              uint8x8_t *a1, uint8x8_t *a2,
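
Editor's note: transpose_s16_16x16 treats the tile as a 2x2 grid of 8x8 blocks. For a block matrix [[A B], [C D]] the transpose is [[A^T C^T], [B^T D^T]], so swapping the off-diagonal quadrants (in0[8..15] holds the bottom-left half-rows, in1[0..7] the top-right) and then transposing each quadrant in place produces the full transpose. The same idea in scalar form (hypothetical helpers):

#include <stdint.h>

static void transpose8_inplace(int16_t *m, int stride) {
  int i, j;
  for (i = 0; i < 8; ++i) {
    for (j = i + 1; j < 8; ++j) {
      const int16_t t = m[i * stride + j];
      m[i * stride + j] = m[j * stride + i];
      m[j * stride + i] = t;
    }
  }
}

static void transpose16(int16_t m[16][16]) {
  int i, j;
  /* Swap the top-right and bottom-left 8x8 quadrants. */
  for (i = 0; i < 8; ++i) {
    for (j = 0; j < 8; ++j) {
      const int16_t t = m[i][j + 8];
      m[i][j + 8] = m[i + 8][j];
      m[i + 8][j] = t;
    }
  }
  /* Transpose each quadrant in place. */
  transpose8_inplace(&m[0][0], 16);
  transpose8_inplace(&m[0][8], 16);
  transpose8_inplace(&m[8][0], 16);
  transpose8_inplace(&m[8][8], 16);
}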
diff --git a/libvpx/vpx_dsp/arm/variance_neon.c b/libvpx/vpx_dsp/arm/variance_neon.c
index 410ce7d..7b93f14 100644
--- a/libvpx/vpx_dsp/arm/variance_neon.c
+++ b/libvpx/vpx_dsp/arm/variance_neon.c
@@ -268,7 +268,7 @@
   variance_neon_w16(src_ptr, src_stride, ref_ptr, ref_stride, 16, 16, sse, sum);
 }
 
-#define varianceNxM(n, m, shift)                                             \
+#define VARIANCENXM(n, m, shift)                                             \
   unsigned int vpx_variance##n##x##m##_neon(                                 \
       const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr,        \
       int ref_stride, unsigned int *sse) {                                   \
@@ -288,16 +288,16 @@
       return *sse - (uint32_t)(((int64_t)sum * sum) >> shift);               \
   }
 
-varianceNxM(4, 4, 4);
-varianceNxM(4, 8, 5);
-varianceNxM(8, 4, 5);
-varianceNxM(8, 8, 6);
-varianceNxM(8, 16, 7);
-varianceNxM(16, 8, 7);
-varianceNxM(16, 16, 8);
-varianceNxM(16, 32, 9);
-varianceNxM(32, 16, 9);
-varianceNxM(32, 32, 10);
+VARIANCENXM(4, 4, 4)
+VARIANCENXM(4, 8, 5)
+VARIANCENXM(8, 4, 5)
+VARIANCENXM(8, 8, 6)
+VARIANCENXM(8, 16, 7)
+VARIANCENXM(16, 8, 7)
+VARIANCENXM(16, 16, 8)
+VARIANCENXM(16, 32, 9)
+VARIANCENXM(32, 16, 9)
+VARIANCENXM(32, 32, 10)
 
 unsigned int vpx_variance32x64_neon(const uint8_t *src_ptr, int src_stride,
                                     const uint8_t *ref_ptr, int ref_stride,
diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c b/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c
index 4470b28..c4177c5 100644
--- a/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c
+++ b/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.c
@@ -35,7 +35,7 @@
     }                                                                        \
   }
 
-DEFINE_FILTER(horiz);
-DEFINE_FILTER(avg_horiz);
-DEFINE_FILTER(vert);
-DEFINE_FILTER(avg_vert);
+DEFINE_FILTER(horiz)
+DEFINE_FILTER(avg_horiz)
+DEFINE_FILTER(vert)
+DEFINE_FILTER(avg_vert)
diff --git a/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h b/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h
index b123d1c..f1c7d62 100644
--- a/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h
+++ b/libvpx/vpx_dsp/arm/vpx_convolve8_neon_asm.h
@@ -17,13 +17,13 @@
       ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, \
       int x_step_q4, int y0_q4, int y_step_q4, int w, int h);
 
-DECLARE_FILTER(horiz, type1);
-DECLARE_FILTER(avg_horiz, type1);
-DECLARE_FILTER(horiz, type2);
-DECLARE_FILTER(avg_horiz, type2);
-DECLARE_FILTER(vert, type1);
-DECLARE_FILTER(avg_vert, type1);
-DECLARE_FILTER(vert, type2);
-DECLARE_FILTER(avg_vert, type2);
+DECLARE_FILTER(horiz, type1)
+DECLARE_FILTER(avg_horiz, type1)
+DECLARE_FILTER(horiz, type2)
+DECLARE_FILTER(avg_horiz, type2)
+DECLARE_FILTER(vert, type1)
+DECLARE_FILTER(avg_vert, type1)
+DECLARE_FILTER(vert, type2)
+DECLARE_FILTER(avg_vert, type2)
 
 #endif  // VPX_VPX_DSP_ARM_VPX_CONVOLVE8_NEON_ASM_H_
diff --git a/libvpx/vpx_dsp/fastssim.c b/libvpx/vpx_dsp/fastssim.c
index 6ab6f55..4d32a02 100644
--- a/libvpx/vpx_dsp/fastssim.c
+++ b/libvpx/vpx_dsp/fastssim.c
@@ -47,7 +47,7 @@
   unsigned *col_buf;
 };
 
-static void fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
+static int fs_ctx_init(fs_ctx *_ctx, int _w, int _h, int _nlevels) {
   unsigned char *data;
   size_t data_size;
   int lw;
@@ -71,6 +71,7 @@
     lh = (lh + 1) >> 1;
   }
   data = (unsigned char *)malloc(data_size);
+  if (!data) return -1;
   _ctx->level = (fs_level *)data;
   _ctx->nlevels = _nlevels;
   data += _nlevels * sizeof(*_ctx->level);
@@ -95,6 +96,7 @@
     lh = (lh + 1) >> 1;
   }
   _ctx->col_buf = (unsigned *)data;
+  return 0;
 }
 
 static void fs_ctx_clear(fs_ctx *_ctx) { free(_ctx->level); }
@@ -456,7 +458,7 @@
   double ret;
   int l;
   ret = 1;
-  fs_ctx_init(&ctx, _w, _h, FS_NLEVELS);
+  if (fs_ctx_init(&ctx, _w, _h, FS_NLEVELS)) return 99.0;
   fs_downsample_level0(&ctx, _src, _systride, _dst, _dystride, _w, _h, _bd,
                        _shift);
   for (l = 0; l < FS_NLEVELS - 1; l++) {
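
Editor's note: the fs_ctx_init change closes a crash on allocation failure: the function now returns nonzero when malloc fails, and the caller bails out with a sentinel score instead of dereferencing NULL. The shape of the pattern, with hypothetical names:

#include <stdlib.h>

typedef struct { unsigned char *data; } ctx_t;

/* Returns 0 on success, -1 if the allocation fails. */
static int ctx_init(ctx_t *ctx, size_t size) {
  ctx->data = (unsigned char *)malloc(size);
  return ctx->data ? 0 : -1;
}

static double compute_metric(size_t size) {
  ctx_t ctx;
  if (ctx_init(&ctx, size)) return 99.0; /* sentinel on OOM, as in the patch */
  /* ... real work would go here ... */
  free(ctx.data);
  return 0.0;
}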
diff --git a/libvpx/vpx_dsp/loongarch/avg_lsx.c b/libvpx/vpx_dsp/loongarch/avg_lsx.c
new file mode 100644
index 0000000..750c9de
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/avg_lsx.c
@@ -0,0 +1,90 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <stdlib.h>
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/bitdepth_conversion_lsx.h"
+
+void vpx_hadamard_8x8_lsx(const int16_t *src, ptrdiff_t src_stride,
+                          tran_low_t *dst) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  ptrdiff_t src_stride2 = src_stride << 1;
+  ptrdiff_t src_stride3 = src_stride2 + src_stride;
+  ptrdiff_t src_stride4 = src_stride2 << 1;
+  ptrdiff_t src_stride6 = src_stride3 << 1;
+
+  int16_t *src_tmp = (int16_t *)src;
+  src0 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src1, src2);
+  src3 = __lsx_vldx(src_tmp, src_stride6);
+  src_tmp += src_stride4;
+  src4 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, src_stride2, src_tmp, src_stride4, src5, src6);
+  src7 = __lsx_vldx(src_tmp, src_stride6);
+
+  LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2,
+                    tmp4, tmp6, tmp7, tmp5, tmp3, tmp1);
+  LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1,
+                    src4, src5, src7, src6, src3, src2);
+  LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7,
+                    tmp3, tmp4, tmp5, tmp1, tmp6, tmp2);
+  LSX_TRANSPOSE8x8_H(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, src0, src1,
+                     src2, src3, src4, src5, src6, src7);
+  LSX_BUTTERFLY_8_H(src0, src2, src4, src6, src7, src5, src3, src1, tmp0, tmp2,
+                    tmp4, tmp6, tmp7, tmp5, tmp3, tmp1);
+  LSX_BUTTERFLY_8_H(tmp0, tmp1, tmp4, tmp5, tmp7, tmp6, tmp3, tmp2, src0, src1,
+                    src4, src5, src7, src6, src3, src2);
+  LSX_BUTTERFLY_8_H(src0, src1, src2, src3, src7, src6, src5, src4, tmp0, tmp7,
+                    tmp3, tmp4, tmp5, tmp1, tmp6, tmp2);
+  store_tran_low(tmp0, dst, 0);
+  store_tran_low(tmp1, dst, 8);
+  store_tran_low(tmp2, dst, 16);
+  store_tran_low(tmp3, dst, 24);
+  store_tran_low(tmp4, dst, 32);
+  store_tran_low(tmp5, dst, 40);
+  store_tran_low(tmp6, dst, 48);
+  store_tran_low(tmp7, dst, 56);
+}
+
+void vpx_hadamard_16x16_lsx(const int16_t *src, ptrdiff_t src_stride,
+                            tran_low_t *dst) {
+  int i;
+  __m128i a0, a1, a2, a3, b0, b1, b2, b3;
+
+  /* Rearrange 16x16 to 8x32 and remove stride.
+   * Top left first. */
+  vpx_hadamard_8x8_lsx(src + 0 + 0 * src_stride, src_stride, dst + 0);
+  /* Top right. */
+  vpx_hadamard_8x8_lsx(src + 8 + 0 * src_stride, src_stride, dst + 64);
+  /* Bottom left. */
+  vpx_hadamard_8x8_lsx(src + 0 + 8 * src_stride, src_stride, dst + 128);
+  /* Bottom right. */
+  vpx_hadamard_8x8_lsx(src + 8 + 8 * src_stride, src_stride, dst + 192);
+
+  for (i = 0; i < 64; i += 8) {
+    a0 = load_tran_low(dst);
+    a1 = load_tran_low(dst + 64);
+    a2 = load_tran_low(dst + 128);
+    a3 = load_tran_low(dst + 192);
+
+    LSX_BUTTERFLY_4_H(a0, a2, a3, a1, b0, b2, b3, b1);
+    DUP4_ARG2(__lsx_vsrai_h, b0, 1, b1, 1, b2, 1, b3, 1, b0, b1, b2, b3);
+    LSX_BUTTERFLY_4_H(b0, b1, b3, b2, a0, a1, a3, a2);
+
+    store_tran_low(a0, dst, 0);
+    store_tran_low(a1, dst, 64);
+    store_tran_low(a2, dst, 128);
+    store_tran_low(a3, dst, 192);
+
+    dst += 8;
+  }
+}
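
Editor's note: the 16x16 Hadamard is built recursively; four 8x8 transforms fill the quadrants of dst, and the loop then merges one coefficient from each quadrant with a halved butterfly, the >>1 keeping the merged values within 16-bit range. A scalar sketch of that combine stage, mirroring the generic vpx_hadamard_16x16_c:

#include <stdint.h>

static void hadamard_16x16_combine(int16_t *coeff) {
  int i;
  for (i = 0; i < 64; ++i) {
    const int16_t a0 = coeff[0], a1 = coeff[64];
    const int16_t a2 = coeff[128], a3 = coeff[192];
    /* Halved butterfly across the four 8x8 sub-block outputs. */
    const int16_t b0 = (int16_t)((a0 + a1) >> 1);
    const int16_t b1 = (int16_t)((a0 - a1) >> 1);
    const int16_t b2 = (int16_t)((a2 + a3) >> 1);
    const int16_t b3 = (int16_t)((a2 - a3) >> 1);
    coeff[0] = (int16_t)(b0 + b2);
    coeff[64] = (int16_t)(b1 + b3);
    coeff[128] = (int16_t)(b0 - b2);
    coeff[192] = (int16_t)(b1 - b3);
    ++coeff;
  }
}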
diff --git a/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c b/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c
new file mode 100644
index 0000000..4826260
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/avg_pred_lsx.c
@@ -0,0 +1,83 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+void vpx_comp_avg_pred_lsx(uint8_t *comp_pred, const uint8_t *pred, int width,
+                           int height, const uint8_t *ref, int ref_stride) {
+  // width > 8 || width == 8 || width == 4
+  if (width > 8) {
+    int i, j;
+    for (i = 0; i < height; ++i) {
+      for (j = 0; j < width; j += 16) {
+        __m128i p, r, avg;
+
+        p = __lsx_vld(pred + j, 0);
+        r = __lsx_vld(ref + j, 0);
+        avg = __lsx_vavgr_bu(p, r);
+        __lsx_vst(avg, comp_pred + j, 0);
+      }
+      comp_pred += width;
+      pred += width;
+      ref += ref_stride;
+    }
+  } else if (width == 8) {
+    int i = height * width;
+    do {
+      __m128i p, r, r_0, r_1;
+
+      p = __lsx_vld(pred, 0);
+      r_0 = __lsx_vld(ref, 0);
+      ref += ref_stride;
+      r_1 = __lsx_vld(ref, 0);
+      ref += ref_stride;
+      r = __lsx_vilvl_d(r_1, r_0);
+      r = __lsx_vavgr_bu(p, r);
+
+      __lsx_vst(r, comp_pred, 0);
+
+      pred += 16;
+      comp_pred += 16;
+      i -= 16;
+    } while (i);
+  } else {  // width = 4
+    int i = height * width;
+    assert(width == 4);
+    do {
+      __m128i p, r, r_0, r_1, r_2, r_3;
+      p = __lsx_vld(pred, 0);
+
+      if (width == ref_stride) {
+        r = __lsx_vld(ref, 0);
+        ref += 16;
+      } else {
+        r_0 = __lsx_vld(ref, 0);
+        ref += ref_stride;
+        r_1 = __lsx_vld(ref, 0);
+        ref += ref_stride;
+        r_2 = __lsx_vld(ref, 0);
+        ref += ref_stride;
+        r_3 = __lsx_vld(ref, 0);
+        ref += ref_stride;
+        DUP2_ARG2(__lsx_vilvl_w, r_1, r_0, r_3, r_2, r_0, r_2);
+        r = __lsx_vilvl_d(r_2, r_0);
+      }
+      r = __lsx_vavgr_bu(p, r);
+
+      __lsx_vst(r, comp_pred, 0);
+      comp_pred += 16;
+      pred += 16;
+      i -= 16;
+    } while (i);
+  }
+}
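
Editor's note: __lsx_vavgr_bu is a rounding unsigned byte average, so all three width paths reduce to the same per-pixel arithmetic as the generic C version; only the load/store shapes differ. A scalar reference, equivalent to vpx_comp_avg_pred_c:

#include <stdint.h>

static void comp_avg_pred_ref(uint8_t *comp_pred, const uint8_t *pred,
                              int width, int height, const uint8_t *ref,
                              int ref_stride) {
  int i, j;
  for (i = 0; i < height; ++i) {
    for (j = 0; j < width; ++j) {
      /* Rounded average: (p + r + 1) >> 1, matching __lsx_vavgr_bu. */
      comp_pred[j] = (uint8_t)((pred[j] + ref[j] + 1) >> 1);
    }
    comp_pred += width;
    pred += width;
    ref += ref_stride;
  }
}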
diff --git a/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h b/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h
new file mode 100644
index 0000000..b0db1e9
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/bitdepth_conversion_lsx.h
@@ -0,0 +1,41 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
+
+#include "./vpx_config.h"
+#include "vpx/vpx_integer.h"
+#include "vpx_dsp/vpx_dsp_common.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static INLINE __m128i load_tran_low(const tran_low_t *s) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  __m128i v0_m = __lsx_vld(s, 0);
+  __m128i v1_m = __lsx_vld(s + 4, 0);
+  return __lsx_vsrlni_h_w(v0_m, v1_m, 0);
+#else
+  return __lsx_vld(s, 0);
+#endif
+}
+
+static INLINE void store_tran_low(__m128i v, tran_low_t *s, int32_t c) {
+#if CONFIG_VP9_HIGHBITDEPTH
+  __m128i v0_m, v1_m;
+  v1_m = __lsx_vexth_w_h(v);
+  v0_m = __lsx_vsllwil_w_h(v, 0);
+  __lsx_vst(v0_m, s + c, 0);
+  __lsx_vst(v1_m, s + c + 4, 0);
+#else
+  __lsx_vst(v, s + c, 0);
+#endif
+}
+
+#endif  // VPX_VPX_DSP_LOONGARCH_BITDEPTH_CONVERSION_LSX_H_
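
Editor's note: these helpers hide the storage width of tran_low_t. With CONFIG_VP9_HIGHBITDEPTH it is 32 bits in memory while the LSX kernels operate on 16-bit lanes, so a load narrows two 4x32 vectors into one 8x16 vector and a store sign-extends it back. Per element the round trip is a plain cast, on the assumption that the value fits in 16 bits:

#include <stdint.h>

typedef int32_t tran_low_t; /* 32-bit storage when high bitdepth is enabled */

static int16_t load_one(const tran_low_t *s) { return (int16_t)*s; }
static void store_one(int16_t v, tran_low_t *s) { *s = v; }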
diff --git a/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c b/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c
new file mode 100644
index 0000000..9bb3877
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/fwd_dct32x32_lsx.c
@@ -0,0 +1,1176 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
+#include "vpx_dsp/fwd_txfm.h"
+
+#define UNPCK_SH_SW(in, out0, out1)  \
+  do {                               \
+    out0 = __lsx_vsllwil_w_h(in, 0); \
+    out1 = __lsx_vexth_w_h(in);      \
+  } while (0)
+
+static void fdct8x32_1d_column_load_butterfly(const int16_t *input,
+                                              int32_t src_stride,
+                                              int16_t *temp_buff) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i step0, step1, step2, step3;
+  __m128i in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
+  __m128i step0_1, step1_1, step2_1, step3_1;
+
+  int32_t stride = src_stride << 1;
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  const int16_t *input_tmp = (int16_t *)input;
+
+  in0 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1, in2);
+  in3 = __lsx_vldx(input_tmp, stride3);
+
+  input_tmp += stride2;
+  in0_1 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1_1, in2_1);
+  in3_1 = __lsx_vldx(input_tmp, stride3);
+
+  input_tmp = input + (src_stride * 24);
+  in4_1 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5_1, in6_1);
+  in7_1 = __lsx_vldx(input_tmp, stride3);
+
+  input_tmp += stride2;
+  in4 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5, in6);
+  in7 = __lsx_vldx(input_tmp, stride3);
+
+  DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+  DUP4_ARG2(__lsx_vslli_h, in0_1, 2, in1_1, 2, in2_1, 2, in3_1, 2, in0_1, in1_1,
+            in2_1, in3_1);
+  DUP4_ARG2(__lsx_vslli_h, in4_1, 2, in5_1, 2, in6_1, 2, in7_1, 2, in4_1, in5_1,
+            in6_1, in7_1);
+  LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
+                    step3, in4, in5, in6, in7);
+  LSX_BUTTERFLY_8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+                    step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1,
+                    in7_1);
+
+  __lsx_vst(step0, temp_buff, 0);
+  __lsx_vst(step1, temp_buff, 16);
+  __lsx_vst(step2, temp_buff, 32);
+  __lsx_vst(step3, temp_buff, 48);
+
+  __lsx_vst(in4, temp_buff, 448);
+  __lsx_vst(in5, temp_buff, 464);
+  __lsx_vst(in6, temp_buff, 480);
+  __lsx_vst(in7, temp_buff, 496);
+
+  __lsx_vst(step0_1, temp_buff, 64);
+  __lsx_vst(step1_1, temp_buff, 80);
+  __lsx_vst(step2_1, temp_buff, 96);
+  __lsx_vst(step3_1, temp_buff, 112);
+
+  __lsx_vst(in4_1, temp_buff, 384);
+  __lsx_vst(in5_1, temp_buff, 400);
+  __lsx_vst(in6_1, temp_buff, 416);
+  __lsx_vst(in7_1, temp_buff, 432);
+
+  /* 3rd and 4th set */
+  input_tmp = input + (src_stride * 8);
+  in0 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1, in2);
+  in3 = __lsx_vldx(input_tmp, stride3);
+
+  input_tmp += stride2;
+  in0_1 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in1_1, in2_1);
+  in3_1 = __lsx_vldx(input_tmp, stride3);
+
+  input_tmp += stride2;
+  in4_1 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5_1, in6_1);
+  in7_1 = __lsx_vldx(input_tmp, stride3);
+
+  input_tmp += stride2;
+  in4 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, stride, input_tmp, stride2, in5, in6);
+  in7 = __lsx_vldx(input_tmp, stride3);
+  DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+  DUP4_ARG2(__lsx_vslli_h, in0_1, 2, in1_1, 2, in2_1, 2, in3_1, 2, in0_1, in1_1,
+            in2_1, in3_1);
+  DUP4_ARG2(__lsx_vslli_h, in4_1, 2, in5_1, 2, in6_1, 2, in7_1, 2, in4_1, in5_1,
+            in6_1, in7_1);
+
+  LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, step0, step1, step2,
+                    step3, in4, in5, in6, in7);
+  LSX_BUTTERFLY_8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+                    step0_1, step1_1, step2_1, step3_1, in4_1, in5_1, in6_1,
+                    in7_1);
+
+  __lsx_vst(step0, temp_buff, 128);
+  __lsx_vst(step1, temp_buff, 144);
+  __lsx_vst(step2, temp_buff, 160);
+  __lsx_vst(step3, temp_buff, 176);
+
+  __lsx_vst(in4, temp_buff, 320);
+  __lsx_vst(in5, temp_buff, 336);
+  __lsx_vst(in6, temp_buff, 352);
+  __lsx_vst(in7, temp_buff, 368);
+
+  __lsx_vst(step0_1, temp_buff, 192);
+  __lsx_vst(step1_1, temp_buff, 208);
+  __lsx_vst(step2_1, temp_buff, 224);
+  __lsx_vst(step3_1, temp_buff, 240);
+
+  __lsx_vst(in4_1, temp_buff, 256);
+  __lsx_vst(in5_1, temp_buff, 272);
+  __lsx_vst(in6_1, temp_buff, 288);
+  __lsx_vst(in7_1, temp_buff, 304);
+}
+
+static void fdct8x32_1d_column_even_store(int16_t *input, int16_t *temp) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i temp0, temp1;
+
+  /* fdct even */
+  DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2,
+            in3);
+  DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240, in12,
+            in13, in14, in15);
+  LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in12, in13, in14, in15, vec0, vec1,
+                    vec2, vec3, in12, in13, in14, in15);
+  DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, in4, in5,
+            in6, in7);
+  DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176, in8, in9,
+            in10, in11);
+  LSX_BUTTERFLY_8_H(in4, in5, in6, in7, in8, in9, in10, in11, vec4, vec5, vec6,
+                    vec7, in8, in9, in10, in11);
+
+  /* Stage 3 */
+  DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0,
+            in1, in2, in3);
+  LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, in4, in1, in0);
+  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 0);
+  __lsx_vst(temp1, temp, 1024);
+
+  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 512);
+  __lsx_vst(temp1, temp, 1536);
+
+  DUP4_ARG2(__lsx_vsub_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, vec7,
+            vec6, vec5, vec4);
+  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 256);
+  __lsx_vst(temp1, temp, 1792);
+
+  DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7);
+  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 1280);
+  __lsx_vst(temp1, temp, 768);
+
+  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0,
+            vec1, vec6, in2);
+  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7);
+  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 128);
+  __lsx_vst(temp1, temp, 1920);
+
+  DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2);
+  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 1152);
+  __lsx_vst(temp1, temp, 896);
+
+  DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5);
+  temp0 = __lsx_vneg_h(vec2);
+  DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1);
+  DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0,
+            vec2, vec5);
+  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 640);
+  __lsx_vst(temp1, temp, 1408);
+
+  DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4);
+  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+  FDCT32_POSTPROC_2V_POS_H(temp0, temp1);
+  __lsx_vst(temp0, temp, 384);
+  __lsx_vst(temp1, temp, 1664);
+}
+
+static void fdct8x32_1d_column_odd_store(int16_t *input, int16_t *temp_ptr) {
+  __m128i in16, in17, in18, in19, in20, in21, in22, in23;
+  __m128i in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
+  __m128i tmp0, tmp1;
+
+  DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 160, input, 176, in20, in21,
+            in26, in27);
+
+  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+  DUP4_ARG2(__lsx_vld, input, 32, input, 48, input, 192, input, 208, in18, in19,
+            in28, in29);
+
+  vec4 = __lsx_vsub_h(in19, in20);
+  __lsx_vst(vec4, input, 64);
+  vec4 = __lsx_vsub_h(in18, in21);
+  __lsx_vst(vec4, input, 80);
+  vec4 = __lsx_vsub_h(in29, in26);
+  __lsx_vst(vec4, input, 160);
+  vec4 = __lsx_vsub_h(in28, in27);
+  __lsx_vst(vec4, input, 176);
+
+  in21 = __lsx_vadd_h(in18, in21);
+  in20 = __lsx_vadd_h(in19, in20);
+  in27 = __lsx_vadd_h(in28, in27);
+  in26 = __lsx_vadd_h(in29, in26);
+
+  DUP4_ARG2(__lsx_vld, input, 96, input, 112, input, 128, input, 144, in22,
+            in23, in24, in25);
+  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+
+  DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 224, input, 240, in16, in17,
+            in30, in31);
+
+  vec4 = __lsx_vsub_h(in17, in22);
+  __lsx_vst(vec4, input, 32);
+  vec4 = __lsx_vsub_h(in16, in23);
+  __lsx_vst(vec4, input, 48);
+  vec4 = __lsx_vsub_h(in31, in24);
+  __lsx_vst(vec4, input, 192);
+  vec4 = __lsx_vsub_h(in30, in25);
+  __lsx_vst(vec4, input, 208);
+
+  DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16,
+            in17, in30, in31);
+  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+  DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27,
+            in22, in21, in25);
+  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+  DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20);
+  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec5, temp_ptr, 0);
+  __lsx_vst(vec4, temp_ptr, 1920);
+
+  DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec5, temp_ptr, 896);
+  __lsx_vst(vec4, temp_ptr, 1024);
+
+  DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23,
+            in26, in24, in20);
+  tmp0 = __lsx_vneg_h(in23);
+  DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25);
+  DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20);
+  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec4, temp_ptr, 1408);
+  __lsx_vst(vec5, temp_ptr, 512);
+
+  DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec4, temp_ptr, 384);
+  __lsx_vst(vec5, temp_ptr, 1536);
+
+  DUP4_ARG2(__lsx_vld, input, 32, input, 48, input, 64, input, 80, in22, in23,
+            in20, in21);
+  DUP4_ARG2(__lsx_vld, input, 160, input, 176, input, 192, input, 208, in26,
+            in27, in24, in25);
+  in16 = in20;
+  in17 = in21;
+  DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1);
+  DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27);
+  DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26);
+  DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28,
+            in17, in18, in31);
+  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+  DUP2_ARG2(__lsx_vadd_h, in28, in29, in31, in30, in16, in19);
+  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec5, temp_ptr, 1664);
+  __lsx_vst(vec4, temp_ptr, 256);
+
+  DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec5, temp_ptr, 640);
+  __lsx_vst(vec4, temp_ptr, 1280);
+
+  DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16,
+            in29, in30, in19);
+  tmp0 = __lsx_vneg_h(in16);
+  DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31);
+  DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19);
+  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec5, temp_ptr, 1152);
+  __lsx_vst(vec4, temp_ptr, 768);
+
+  DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+  FDCT32_POSTPROC_2V_POS_H(vec5, vec4);
+  __lsx_vst(vec5, temp_ptr, 128);
+  __lsx_vst(vec4, temp_ptr, 1792);
+}
+
+static void fdct8x32_1d_column(const int16_t *input, int32_t src_stride,
+                               int16_t *tmp_buf, int16_t *tmp_buf_big) {
+  fdct8x32_1d_column_load_butterfly(input, src_stride, tmp_buf);
+  fdct8x32_1d_column_even_store(tmp_buf, tmp_buf_big);
+  fdct8x32_1d_column_odd_store(tmp_buf + 128, (tmp_buf_big + 32));
+}
+
+static void fdct8x32_1d_row_load_butterfly(int16_t *temp_buff,
+                                           int16_t *output) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+  __m128i step0, step1, step2, step3, step4, step5, step6, step7;
+
+  DUP4_ARG2(__lsx_vld, temp_buff, 0, temp_buff, 64, temp_buff, 128, temp_buff,
+            192, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vld, temp_buff, 256, temp_buff, 320, temp_buff, 384,
+            temp_buff, 448, in4, in5, in6, in7);
+  DUP4_ARG2(__lsx_vld, temp_buff, 48, temp_buff, 112, temp_buff, 176, temp_buff,
+            240, in8, in9, in10, in11);
+  DUP4_ARG2(__lsx_vld, temp_buff, 304, temp_buff, 368, temp_buff, 432,
+            temp_buff, 496, in12, in13, in14, in15);
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+  LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+                     in10, in11, in12, in13, in14, in15);
+  LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+                     in11, in12, in13, in14, in15, step0, step1, step2, step3,
+                     step4, step5, step6, step7, in8, in9, in10, in11, in12,
+                     in13, in14, in15);
+
+  __lsx_vst(step0, output, 0);
+  __lsx_vst(step1, output, 16);
+  __lsx_vst(step2, output, 32);
+  __lsx_vst(step3, output, 48);
+  __lsx_vst(step4, output, 64);
+  __lsx_vst(step5, output, 80);
+  __lsx_vst(step6, output, 96);
+  __lsx_vst(step7, output, 112);
+
+  __lsx_vst(in8, output, 384);
+  __lsx_vst(in9, output, 400);
+  __lsx_vst(in10, output, 416);
+  __lsx_vst(in11, output, 432);
+  __lsx_vst(in12, output, 448);
+  __lsx_vst(in13, output, 464);
+  __lsx_vst(in14, output, 480);
+  __lsx_vst(in15, output, 496);
+
+  /* 2nd set */
+  DUP4_ARG2(__lsx_vld, temp_buff, 16, temp_buff, 80, temp_buff, 144, temp_buff,
+            208, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vld, temp_buff, 272, temp_buff, 336, temp_buff, 400,
+            temp_buff, 464, in4, in5, in6, in7);
+  DUP4_ARG2(__lsx_vld, temp_buff, 32, temp_buff, 96, temp_buff, 160, temp_buff,
+            224, in8, in9, in10, in11);
+  DUP4_ARG2(__lsx_vld, temp_buff, 288, temp_buff, 352, temp_buff, 416,
+            temp_buff, 480, in12, in13, in14, in15);
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+  LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+                     in10, in11, in12, in13, in14, in15);
+  LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+                     in11, in12, in13, in14, in15, step0, step1, step2, step3,
+                     step4, step5, step6, step7, in8, in9, in10, in11, in12,
+                     in13, in14, in15);
+
+  __lsx_vst(step0, output, 128);
+  __lsx_vst(step1, output, 144);
+  __lsx_vst(step2, output, 160);
+  __lsx_vst(step3, output, 176);
+  __lsx_vst(step4, output, 192);
+  __lsx_vst(step5, output, 208);
+  __lsx_vst(step6, output, 224);
+  __lsx_vst(step7, output, 240);
+
+  __lsx_vst(in8, output, 256);
+  __lsx_vst(in9, output, 272);
+  __lsx_vst(in10, output, 288);
+  __lsx_vst(in11, output, 304);
+  __lsx_vst(in12, output, 320);
+  __lsx_vst(in13, output, 336);
+  __lsx_vst(in14, output, 352);
+  __lsx_vst(in15, output, 368);
+}
+
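+/* Even half of the row transform for the first 8 rows: stage 3 is widened to
+   32-bit lanes (UNPCK_SH_SW) before being rounded back to 16 bits,
+   presumably to preserve intermediate precision on the first pass. */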
+static void fdct8x32_1d_row_even_4x(int16_t *input, int16_t *interm_ptr,
+                                    int16_t *out) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i vec0_l, vec1_l, vec2_l, vec3_l, vec4_l, vec5_l, vec6_l, vec7_l;
+  __m128i vec0_r, vec1_r, vec2_r, vec3_r, vec4_r, vec5_r, vec6_r, vec7_r;
+  __m128i tmp0_w, tmp1_w, tmp2_w, tmp3_w;
+
+  /* fdct32 even */
+  /* stage 2 */
+  DUP4_ARG2(__lsx_vld, input, 0, input, 16, input, 32, input, 48, in0, in1, in2,
+            in3);
+  DUP4_ARG2(__lsx_vld, input, 64, input, 80, input, 96, input, 112, in4, in5,
+            in6, in7);
+  DUP4_ARG2(__lsx_vld, input, 128, input, 144, input, 160, input, 176, in8, in9,
+            in10, in11);
+  DUP4_ARG2(__lsx_vld, input, 192, input, 208, input, 224, input, 240, in12,
+            in13, in14, in15);
+
+  LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+                     in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4,
+                     vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14,
+                     in15);
+
+  __lsx_vst(vec0, interm_ptr, 0);
+  __lsx_vst(vec1, interm_ptr, 16);
+  __lsx_vst(vec2, interm_ptr, 32);
+  __lsx_vst(vec3, interm_ptr, 48);
+  __lsx_vst(vec4, interm_ptr, 64);
+  __lsx_vst(vec5, interm_ptr, 80);
+  __lsx_vst(vec6, interm_ptr, 96);
+  __lsx_vst(vec7, interm_ptr, 112);
+
+  __lsx_vst(in8, interm_ptr, 128);
+  __lsx_vst(in9, interm_ptr, 144);
+  __lsx_vst(in10, interm_ptr, 160);
+  __lsx_vst(in11, interm_ptr, 176);
+  __lsx_vst(in12, interm_ptr, 192);
+  __lsx_vst(in13, interm_ptr, 208);
+  __lsx_vst(in14, interm_ptr, 224);
+  __lsx_vst(in15, interm_ptr, 240);
+
+  /* Stage 3 */
+  UNPCK_SH_SW(vec0, vec0_l, vec0_r);
+  UNPCK_SH_SW(vec1, vec1_l, vec1_r);
+  UNPCK_SH_SW(vec2, vec2_l, vec2_r);
+  UNPCK_SH_SW(vec3, vec3_l, vec3_r);
+  UNPCK_SH_SW(vec4, vec4_l, vec4_r);
+  UNPCK_SH_SW(vec5, vec5_l, vec5_r);
+  UNPCK_SH_SW(vec6, vec6_l, vec6_r);
+  UNPCK_SH_SW(vec7, vec7_l, vec7_r);
+  DUP4_ARG2(__lsx_vadd_w, vec0_r, vec7_r, vec1_r, vec6_r, vec2_r, vec5_r,
+            vec3_r, vec4_r, tmp0_w, tmp1_w, tmp2_w, tmp3_w);
+  LSX_BUTTERFLY_4_W(tmp0_w, tmp1_w, tmp2_w, tmp3_w, vec4_r, vec6_r, vec7_r,
+                    vec5_r);
+  DUP4_ARG2(__lsx_vadd_w, vec0_l, vec7_l, vec1_l, vec6_l, vec2_l, vec5_l,
+            vec3_l, vec4_l, vec0_r, vec1_r, vec2_r, vec3_r);
+
+  tmp3_w = __lsx_vadd_w(vec0_r, vec3_r);
+  vec0_r = __lsx_vsub_w(vec0_r, vec3_r);
+  vec3_r = __lsx_vadd_w(vec1_r, vec2_r);
+  vec1_r = __lsx_vsub_w(vec1_r, vec2_r);
+
+  DOTP_CONST_PAIR_W(vec4_r, vec6_r, tmp3_w, vec3_r, cospi_16_64, cospi_16_64,
+                    vec4_r, tmp3_w, vec6_r, vec3_r);
+  FDCT32_POSTPROC_NEG_W(vec4_r);
+  FDCT32_POSTPROC_NEG_W(tmp3_w);
+  FDCT32_POSTPROC_NEG_W(vec6_r);
+  FDCT32_POSTPROC_NEG_W(vec3_r);
+  DUP2_ARG2(__lsx_vpickev_h, vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
+  __lsx_vst(vec5, out, 0);
+  __lsx_vst(vec4, out, 16);
+
+  DOTP_CONST_PAIR_W(vec5_r, vec7_r, vec0_r, vec1_r, cospi_24_64, cospi_8_64,
+                    vec4_r, tmp3_w, vec6_r, vec3_r);
+  FDCT32_POSTPROC_NEG_W(vec4_r);
+  FDCT32_POSTPROC_NEG_W(tmp3_w);
+  FDCT32_POSTPROC_NEG_W(vec6_r);
+  FDCT32_POSTPROC_NEG_W(vec3_r);
+  DUP2_ARG2(__lsx_vpickev_h, vec4_r, tmp3_w, vec6_r, vec3_r, vec4, vec5);
+  __lsx_vst(vec5, out, 32);
+  __lsx_vst(vec4, out, 48);
+
+  DUP4_ARG2(__lsx_vld, interm_ptr, 0, interm_ptr, 16, interm_ptr, 32,
+            interm_ptr, 48, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, interm_ptr, 64, interm_ptr, 80, interm_ptr, 96,
+            interm_ptr, 112, vec4, vec5, vec6, vec7);
+  DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4,
+            vec5, vec6, vec7);
+  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  __lsx_vst(in4, out, 64);
+  __lsx_vst(in5, out, 112);
+
+  DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7);
+  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  __lsx_vst(in4, out, 80);
+  __lsx_vst(in5, out, 96);
+
+  DUP4_ARG2(__lsx_vld, interm_ptr, 128, interm_ptr, 144, interm_ptr, 160,
+            interm_ptr, 176, in8, in9, in10, in11);
+  DUP4_ARG2(__lsx_vld, interm_ptr, 192, interm_ptr, 208, interm_ptr, 224,
+            interm_ptr, 240, in12, in13, in14, in15);
+  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0,
+            vec1, vec6, in2);
+  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7);
+  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  __lsx_vst(in4, out, 128);
+  __lsx_vst(in5, out, 240);
+
+  DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2);
+  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  __lsx_vst(in4, out, 144);
+  __lsx_vst(in5, out, 224);
+
+  DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5);
+  tmp0_w = __lsx_vneg_h(vec2);
+  DOTP_CONST_PAIR(tmp0_w, vec5, cospi_24_64, cospi_8_64, in2, in1);
+  DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0,
+            vec2, vec5);
+  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, in5, in4);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  __lsx_vst(in4, out, 160);
+  __lsx_vst(in5, out, 208);
+
+  DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4);
+  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, in4, in5);
+  FDCT_POSTPROC_2V_NEG_H(in4, in5);
+  __lsx_vst(in4, out, 192);
+  __lsx_vst(in5, out, 176);
+}
+
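+/* Even half of the 32-point row transform in 16-bit arithmetic; each result
+   pair is rounded with FDCT_POSTPROC_2V_NEG_H before it is stored. */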
+static void fdct8x32_1d_row_even(int16_t *temp, int16_t *out) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
+
+  /* fdct32 even */
+  /* stage 2 */
+  DUP4_ARG2(__lsx_vld, temp, 0, temp, 16, temp, 32, temp, 48, in0, in1, in2,
+            in3);
+  DUP4_ARG2(__lsx_vld, temp, 64, temp, 80, temp, 96, temp, 112, in4, in5, in6,
+            in7);
+  DUP4_ARG2(__lsx_vld, temp, 128, temp, 144, temp, 160, temp, 176, in8, in9,
+            in10, in11);
+  DUP4_ARG2(__lsx_vld, temp, 192, temp, 208, temp, 224, temp, 240, in12, in13,
+            in14, in15);
+
+  LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+                     in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4,
+                     vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14,
+                     in15);
+  /* Stage 3 */
+  DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0,
+            in1, in2, in3);
+  LSX_BUTTERFLY_4_H(in0, in1, in2, in3, temp0, in4, in1, in0);
+  DOTP_CONST_PAIR(temp0, in4, cospi_16_64, cospi_16_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 0);
+  __lsx_vst(temp1, out, 16);
+
+  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 32);
+  __lsx_vst(temp1, out, 48);
+
+  DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4,
+            vec5, vec6, vec7);
+  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 64);
+  __lsx_vst(temp1, out, 112);
+
+  DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7);
+  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 80);
+  __lsx_vst(temp1, out, 96);
+
+  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0,
+            vec1, vec6, in2);
+  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7);
+  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 128);
+  __lsx_vst(temp1, out, 240);
+
+  DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2);
+  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 144);
+  __lsx_vst(temp1, out, 224);
+
+  DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5);
+  temp0 = __lsx_vneg_h(vec2);
+  DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1);
+  DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0,
+            vec2, vec5);
+  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 160);
+  __lsx_vst(temp1, out, 208);
+
+  DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4);
+  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+  FDCT_POSTPROC_2V_NEG_H(temp0, temp1);
+  __lsx_vst(temp0, out, 192);
+  __lsx_vst(temp1, out, 176);
+}
+
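+/* Odd half of the 32-point row transform; early differences are parked in
+   interm_ptr and reloaded for the second group of outputs. */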
+static void fdct8x32_1d_row_odd(int16_t *temp, int16_t *interm_ptr,
+                                int16_t *out) {
+  __m128i in16, in17, in18, in19, in20, in21, in22, in23;
+  __m128i in24, in25, in26, in27, in28, in29, in30, in31, vec4, vec5;
+  __m128i tmp0, tmp1;
+
+  in20 = __lsx_vld(temp, 64);
+  in21 = __lsx_vld(temp, 80);
+  in26 = __lsx_vld(temp, 160);
+  in27 = __lsx_vld(temp, 176);
+
+  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+  in18 = __lsx_vld(temp, 32);
+  in19 = __lsx_vld(temp, 48);
+  in28 = __lsx_vld(temp, 192);
+  in29 = __lsx_vld(temp, 208);
+
+  vec4 = __lsx_vsub_h(in19, in20);
+  __lsx_vst(vec4, interm_ptr, 64);
+  vec4 = __lsx_vsub_h(in18, in21);
+  __lsx_vst(vec4, interm_ptr, 176);
+  vec4 = __lsx_vsub_h(in28, in27);
+  __lsx_vst(vec4, interm_ptr, 112);
+  vec4 = __lsx_vsub_h(in29, in26);
+  __lsx_vst(vec4, interm_ptr, 128);
+
+  DUP4_ARG2(__lsx_vadd_h, in18, in21, in19, in20, in28, in27, in29, in26, in21,
+            in20, in27, in26);
+
+  in22 = __lsx_vld(temp, 96);
+  in23 = __lsx_vld(temp, 112);
+  in24 = __lsx_vld(temp, 128);
+  in25 = __lsx_vld(temp, 144);
+
+  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+
+  in16 = __lsx_vld(temp, 0);
+  in17 = __lsx_vld(temp, 16);
+  in30 = __lsx_vld(temp, 224);
+  in31 = __lsx_vld(temp, 240);
+
+  vec4 = __lsx_vsub_h(in17, in22);
+  __lsx_vst(vec4, interm_ptr, 80);
+  vec4 = __lsx_vsub_h(in30, in25);
+  __lsx_vst(vec4, interm_ptr, 96);
+  vec4 = __lsx_vsub_h(in31, in24);
+  __lsx_vst(vec4, interm_ptr, 144);
+  vec4 = __lsx_vsub_h(in16, in23);
+  __lsx_vst(vec4, interm_ptr, 160);
+
+  DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16,
+            in17, in30, in31);
+  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+
+  DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27,
+            in22, in21, in25);
+  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+  DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20);
+
+  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec5, out, 0);
+  __lsx_vst(vec4, out, 240);
+
+  DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21);
+
+  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec5, out, 224);
+  __lsx_vst(vec4, out, 16);
+
+  DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23,
+            in26, in24, in20);
+  tmp0 = __lsx_vneg_h(in23);
+  DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25);
+  DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20);
+
+  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec4, out, 32);
+  __lsx_vst(vec5, out, 208);
+
+  DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec4, out, 48);
+  __lsx_vst(vec5, out, 192);
+
+  in20 = __lsx_vld(interm_ptr, 64);
+  in21 = __lsx_vld(interm_ptr, 176);
+  in27 = __lsx_vld(interm_ptr, 112);
+  in26 = __lsx_vld(interm_ptr, 128);
+
+  in16 = in20;
+  in17 = in21;
+  DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1);
+  DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27);
+  DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26);
+
+  in22 = __lsx_vld(interm_ptr, 80);
+  in25 = __lsx_vld(interm_ptr, 96);
+  in24 = __lsx_vld(interm_ptr, 144);
+  in23 = __lsx_vld(interm_ptr, 160);
+
+  DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28,
+            in17, in18, in31);
+  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+  DUP2_ARG2(__lsx_vadd_h, in28, in29, in31, in30, in16, in19);
+  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec5, out, 64);
+  __lsx_vst(vec4, out, 176);
+
+  DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec5, out, 80);
+  __lsx_vst(vec4, out, 160);
+
+  DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16,
+            in29, in30, in19);
+  tmp0 = __lsx_vneg_h(in16);
+  DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31);
+  DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19);
+
+  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec5, out, 144);
+  __lsx_vst(vec4, out, 96);
+
+  DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18);
+
+  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+  FDCT_POSTPROC_2V_NEG_H(vec5, vec4);
+  __lsx_vst(vec4, out, 112);
+  __lsx_vst(vec5, out, 128);
+}
+
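+/* Gather the 8x32 results in four 8x8 tiles, transpose each tile and store
+   the final coefficients in row order. */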
+static void fdct8x32_1d_row_transpose_store(int16_t *temp, int16_t *output) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1;
+
+  /* 1st set */
+  in0 = __lsx_vld(temp, 0);
+  in4 = __lsx_vld(temp, 64);
+  in2 = __lsx_vld(temp, 128);
+  in6 = __lsx_vld(temp, 192);
+  in1 = __lsx_vld(temp, 256);
+  in7 = __lsx_vld(temp, 304);
+  in3 = __lsx_vld(temp, 384);
+  in5 = __lsx_vld(temp, 432);
+
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+
+  /* 2nd set */
+  in0_1 = __lsx_vld(temp, 32);
+  in1_1 = __lsx_vld(temp, 464);
+  in2_1 = __lsx_vld(temp, 160);
+  in3_1 = __lsx_vld(temp, 336);
+  in4_1 = __lsx_vld(temp, 96);
+  in5_1 = __lsx_vld(temp, 352);
+  in6_1 = __lsx_vld(temp, 224);
+  in7_1 = __lsx_vld(temp, 480);
+
+  __lsx_vst(in0, output, 0);
+  __lsx_vst(in1, output, 64);
+  __lsx_vst(in2, output, 128);
+  __lsx_vst(in3, output, 192);
+  __lsx_vst(in4, output, 256);
+  __lsx_vst(in5, output, 320);
+  __lsx_vst(in6, output, 384);
+  __lsx_vst(in7, output, 448);
+
+  LSX_TRANSPOSE8x8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
+
+  /* 3rd set */
+  in0 = __lsx_vld(temp, 16);
+  in1 = __lsx_vld(temp, 272);
+  in2 = __lsx_vld(temp, 144);
+  in3 = __lsx_vld(temp, 400);
+  in4 = __lsx_vld(temp, 80);
+  in5 = __lsx_vld(temp, 416);
+  in6 = __lsx_vld(temp, 208);
+  in7 = __lsx_vld(temp, 288);
+
+  __lsx_vst(in0_1, output, 16);
+  __lsx_vst(in1_1, output, 80);
+  __lsx_vst(in2_1, output, 144);
+  __lsx_vst(in3_1, output, 208);
+  __lsx_vst(in4_1, output, 272);
+  __lsx_vst(in5_1, output, 336);
+  __lsx_vst(in6_1, output, 400);
+  __lsx_vst(in7_1, output, 464);
+
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+
+  __lsx_vst(in0, output, 32);
+  __lsx_vst(in1, output, 96);
+  __lsx_vst(in2, output, 160);
+  __lsx_vst(in3, output, 224);
+  __lsx_vst(in4, output, 288);
+  __lsx_vst(in5, output, 352);
+  __lsx_vst(in6, output, 416);
+  __lsx_vst(in7, output, 480);
+
+  /* 4th set */
+  in0_1 = __lsx_vld(temp, 48);
+  in1_1 = __lsx_vld(temp, 448);
+  in2_1 = __lsx_vld(temp, 176);
+  in3_1 = __lsx_vld(temp, 320);
+  in4_1 = __lsx_vld(temp, 112);
+  in5_1 = __lsx_vld(temp, 368);
+  in6_1 = __lsx_vld(temp, 240);
+  in7_1 = __lsx_vld(temp, 496);
+
+  LSX_TRANSPOSE8x8_H(in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1,
+                     in0_1, in1_1, in2_1, in3_1, in4_1, in5_1, in6_1, in7_1);
+
+  __lsx_vst(in0_1, output, 48);
+  __lsx_vst(in1_1, output, 112);
+  __lsx_vst(in2_1, output, 176);
+  __lsx_vst(in3_1, output, 240);
+  __lsx_vst(in4_1, output, 304);
+  __lsx_vst(in5_1, output, 368);
+  __lsx_vst(in6_1, output, 432);
+  __lsx_vst(in7_1, output, 496);
+}
+
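+/* One 8-row slice of the 32-point row transform: load with butterfly, even
+   and odd halves, then the transposed store. */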
+static void fdct32x8_1d_row(int16_t *temp, int16_t *temp_buf, int16_t *output) {
+  fdct8x32_1d_row_load_butterfly(temp, temp_buf);
+  fdct8x32_1d_row_even(temp_buf, temp_buf);
+  fdct8x32_1d_row_odd(temp_buf + 128, temp, temp_buf + 128);
+  fdct8x32_1d_row_transpose_store(temp_buf, output);
+}
+
+static void fdct32x8_1d_row_4x(int16_t *tmp_buf_big, int16_t *tmp_buf,
+                               int16_t *output) {
+  fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
+  fdct8x32_1d_row_even_4x(tmp_buf, tmp_buf_big, tmp_buf);
+  fdct8x32_1d_row_odd(tmp_buf + 128, tmp_buf_big, tmp_buf + 128);
+  fdct8x32_1d_row_transpose_store(tmp_buf, output);
+}
+
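+/* 32x32 forward DCT: four 8-column column passes into tmp_buf_big, then four
+   8-row row passes; the first row slice takes the higher-precision _4x
+   path. */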
+void vpx_fdct32x32_lsx(const int16_t *input, int16_t *output,
+                       int32_t src_stride) {
+  int i;
+  DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
+
+  /* column transform */
+  for (i = 0; i < 4; ++i) {
+    fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf,
+                       tmp_buf_big + (8 * i));
+  }
+
+  /* row transform: the first 8 rows use the higher-precision 4x path */
+  fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output);
+
+  /* row transform: remaining rows */
+  for (i = 1; i < 4; ++i) {
+    fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256));
+  }
+}
+
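+/* "_rd" variant of the even-half row transform (reduced-precision path used
+   by vpx_fdct32x32_rd_lsx): the stage-2 butterfly outputs are rounded up
+   front instead of rounding before each store. */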
+static void fdct8x32_1d_row_even_rd(int16_t *temp, int16_t *out) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, temp0, temp1;
+
+  /* fdct32 even */
+  /* stage 2 */
+  DUP4_ARG2(__lsx_vld, temp, 0, temp, 16, temp, 32, temp, 48, in0, in1, in2,
+            in3);
+  DUP4_ARG2(__lsx_vld, temp, 64, temp, 80, temp, 96, temp, 112, in4, in5, in6,
+            in7);
+  DUP4_ARG2(__lsx_vld, temp, 128, temp, 144, temp, 160, temp, 176, in8, in9,
+            in10, in11);
+  DUP4_ARG2(__lsx_vld, temp, 192, temp, 208, temp, 224, temp, 240, in12, in13,
+            in14, in15);
+  LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+                     in11, in12, in13, in14, in15, vec0, vec1, vec2, vec3, vec4,
+                     vec5, vec6, vec7, in8, in9, in10, in11, in12, in13, in14,
+                     in15);
+
+  FDCT_POSTPROC_2V_NEG_H(vec0, vec1);
+  FDCT_POSTPROC_2V_NEG_H(vec2, vec3);
+  FDCT_POSTPROC_2V_NEG_H(vec4, vec5);
+  FDCT_POSTPROC_2V_NEG_H(vec6, vec7);
+  FDCT_POSTPROC_2V_NEG_H(in8, in9);
+  FDCT_POSTPROC_2V_NEG_H(in10, in11);
+  FDCT_POSTPROC_2V_NEG_H(in12, in13);
+  FDCT_POSTPROC_2V_NEG_H(in14, in15);
+
+  /* Stage 3 */
+  DUP4_ARG2(__lsx_vadd_h, vec0, vec7, vec1, vec6, vec2, vec5, vec3, vec4, in0,
+            in1, in2, in3);
+
+  temp0 = __lsx_vadd_h(in0, in3);
+  in0 = __lsx_vsub_h(in0, in3);
+  in3 = __lsx_vadd_h(in1, in2);
+  in1 = __lsx_vsub_h(in1, in2);
+
+  DOTP_CONST_PAIR(temp0, in3, cospi_16_64, cospi_16_64, temp1, temp0);
+  __lsx_vst(temp0, out, 0);
+  __lsx_vst(temp1, out, 16);
+
+  DOTP_CONST_PAIR(in0, in1, cospi_24_64, cospi_8_64, temp1, temp0);
+  __lsx_vst(temp0, out, 32);
+  __lsx_vst(temp1, out, 48);
+
+  DUP4_ARG2(__lsx_vsub_h, vec3, vec4, vec2, vec5, vec1, vec6, vec0, vec7, vec4,
+            vec5, vec6, vec7);
+  DOTP_CONST_PAIR(vec6, vec5, cospi_16_64, cospi_16_64, vec5, vec6);
+  DUP2_ARG2(__lsx_vadd_h, vec4, vec5, vec7, vec6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_28_64, cospi_4_64, temp1, temp0);
+  __lsx_vst(temp0, out, 64);
+  __lsx_vst(temp1, out, 112);
+
+  DUP2_ARG2(__lsx_vsub_h, vec4, vec5, vec7, vec6, vec4, vec7);
+  DOTP_CONST_PAIR(vec7, vec4, cospi_12_64, cospi_20_64, temp1, temp0);
+  __lsx_vst(temp0, out, 80);
+  __lsx_vst(temp1, out, 96);
+
+  DOTP_CONST_PAIR(in13, in10, cospi_16_64, cospi_16_64, vec2, vec5);
+  DOTP_CONST_PAIR(in12, in11, cospi_16_64, cospi_16_64, vec3, vec4);
+  DUP4_ARG2(__lsx_vadd_h, in8, vec3, in9, vec2, in14, vec5, in15, vec4, in0,
+            vec1, vec6, in2);
+  DOTP_CONST_PAIR(vec6, vec1, cospi_24_64, cospi_8_64, in1, in3);
+  DUP2_ARG2(__lsx_vadd_h, in0, in1, in2, in3, vec0, vec7);
+  DOTP_CONST_PAIR(vec7, vec0, cospi_30_64, cospi_2_64, temp1, temp0);
+  __lsx_vst(temp0, out, 128);
+  __lsx_vst(temp1, out, 240);
+
+  DUP2_ARG2(__lsx_vsub_h, in0, in1, in2, in3, in0, in2);
+  DOTP_CONST_PAIR(in2, in0, cospi_14_64, cospi_18_64, temp1, temp0);
+  __lsx_vst(temp0, out, 144);
+  __lsx_vst(temp1, out, 224);
+
+  DUP2_ARG2(__lsx_vsub_h, in9, vec2, in14, vec5, vec2, vec5);
+  temp0 = __lsx_vneg_h(vec2);
+  DOTP_CONST_PAIR(temp0, vec5, cospi_24_64, cospi_8_64, in2, in1);
+  DUP4_ARG2(__lsx_vsub_h, in8, vec3, in15, vec4, in3, in2, in0, in1, in3, in0,
+            vec2, vec5);
+  DOTP_CONST_PAIR(vec5, vec2, cospi_22_64, cospi_10_64, temp1, temp0);
+  __lsx_vst(temp0, out, 160);
+  __lsx_vst(temp1, out, 208);
+
+  DUP2_ARG2(__lsx_vadd_h, in3, in2, in0, in1, vec3, vec4);
+  DOTP_CONST_PAIR(vec4, vec3, cospi_6_64, cospi_26_64, temp0, temp1);
+  __lsx_vst(temp0, out, 192);
+  __lsx_vst(temp1, out, 176);
+}
+
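+/* "_rd" variant of the odd-half row transform: same dataflow as
+   fdct8x32_1d_row_odd, with the rounding applied as the inputs are loaded
+   and rotated. */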
+static void fdct8x32_1d_row_odd_rd(int16_t *temp, int16_t *interm_ptr,
+                                   int16_t *out) {
+  __m128i in16, in17, in18, in19, in20, in21, in22, in23;
+  __m128i in24, in25, in26, in27, in28, in29, in30, in31;
+  __m128i vec4, vec5, tmp0, tmp1;
+
+  in20 = __lsx_vld(temp, 64);
+  in21 = __lsx_vld(temp, 80);
+  in26 = __lsx_vld(temp, 160);
+  in27 = __lsx_vld(temp, 176);
+
+  DOTP_CONST_PAIR(in27, in20, cospi_16_64, cospi_16_64, in20, in27);
+  DOTP_CONST_PAIR(in26, in21, cospi_16_64, cospi_16_64, in21, in26);
+
+  FDCT_POSTPROC_2V_NEG_H(in20, in21);
+  FDCT_POSTPROC_2V_NEG_H(in26, in27);
+
+  in18 = __lsx_vld(temp, 32);
+  in19 = __lsx_vld(temp, 48);
+  in28 = __lsx_vld(temp, 192);
+  in29 = __lsx_vld(temp, 208);
+
+  FDCT_POSTPROC_2V_NEG_H(in18, in19);
+  FDCT_POSTPROC_2V_NEG_H(in28, in29);
+
+  vec4 = __lsx_vsub_h(in19, in20);
+  __lsx_vst(vec4, interm_ptr, 64);
+  vec4 = __lsx_vsub_h(in18, in21);
+  __lsx_vst(vec4, interm_ptr, 176);
+  vec4 = __lsx_vsub_h(in29, in26);
+  __lsx_vst(vec4, interm_ptr, 128);
+  vec4 = __lsx_vsub_h(in28, in27);
+  __lsx_vst(vec4, interm_ptr, 112);
+
+  DUP4_ARG2(__lsx_vadd_h, in18, in21, in19, in20, in28, in27, in29, in26, in21,
+            in20, in27, in26);
+
+  in22 = __lsx_vld(temp, 96);
+  in23 = __lsx_vld(temp, 112);
+  in24 = __lsx_vld(temp, 128);
+  in25 = __lsx_vld(temp, 144);
+
+  DOTP_CONST_PAIR(in25, in22, cospi_16_64, cospi_16_64, in22, in25);
+  DOTP_CONST_PAIR(in24, in23, cospi_16_64, cospi_16_64, in23, in24);
+  FDCT_POSTPROC_2V_NEG_H(in22, in23);
+  FDCT_POSTPROC_2V_NEG_H(in24, in25);
+
+  in16 = __lsx_vld(temp, 0);
+  in17 = __lsx_vld(temp, 16);
+  in30 = __lsx_vld(temp, 224);
+  in31 = __lsx_vld(temp, 240);
+
+  FDCT_POSTPROC_2V_NEG_H(in16, in17);
+  FDCT_POSTPROC_2V_NEG_H(in30, in31);
+
+  vec4 = __lsx_vsub_h(in17, in22);
+  __lsx_vst(vec4, interm_ptr, 80);
+  vec4 = __lsx_vsub_h(in30, in25);
+  __lsx_vst(vec4, interm_ptr, 96);
+  vec4 = __lsx_vsub_h(in31, in24);
+  __lsx_vst(vec4, interm_ptr, 144);
+  vec4 = __lsx_vsub_h(in16, in23);
+  __lsx_vst(vec4, interm_ptr, 160);
+
+  DUP4_ARG2(__lsx_vadd_h, in16, in23, in17, in22, in30, in25, in31, in24, in16,
+            in17, in30, in31);
+  DOTP_CONST_PAIR(in26, in21, cospi_24_64, cospi_8_64, in18, in29);
+  DOTP_CONST_PAIR(in27, in20, cospi_24_64, cospi_8_64, in19, in28);
+  DUP4_ARG2(__lsx_vadd_h, in16, in19, in17, in18, in30, in29, in31, in28, in27,
+            in22, in21, in25);
+  DOTP_CONST_PAIR(in21, in22, cospi_28_64, cospi_4_64, in26, in24);
+  DUP2_ARG2(__lsx_vadd_h, in27, in26, in25, in24, in23, in20);
+  DOTP_CONST_PAIR(in20, in23, cospi_31_64, cospi_1_64, vec4, vec5);
+  __lsx_vst(vec5, out, 0);
+  __lsx_vst(vec4, out, 240);
+
+  DUP2_ARG2(__lsx_vsub_h, in27, in26, in25, in24, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_15_64, cospi_17_64, vec5, vec4);
+  __lsx_vst(vec5, out, 224);
+  __lsx_vst(vec4, out, 16);
+
+  DUP4_ARG2(__lsx_vsub_h, in17, in18, in16, in19, in31, in28, in30, in29, in23,
+            in26, in24, in20);
+  tmp0 = __lsx_vneg_h(in23);
+  DOTP_CONST_PAIR(tmp0, in20, cospi_28_64, cospi_4_64, in27, in25);
+  DUP2_ARG2(__lsx_vsub_h, in26, in27, in24, in25, in23, in20);
+  DOTP_CONST_PAIR(in20, in23, cospi_23_64, cospi_9_64, vec4, vec5);
+  __lsx_vst(vec4, out, 32);
+  __lsx_vst(vec5, out, 208);
+
+  DUP2_ARG2(__lsx_vadd_h, in26, in27, in24, in25, in22, in21);
+  DOTP_CONST_PAIR(in21, in22, cospi_7_64, cospi_25_64, vec4, vec5);
+  __lsx_vst(vec4, out, 48);
+  __lsx_vst(vec5, out, 192);
+
+  in20 = __lsx_vld(interm_ptr, 64);
+  in21 = __lsx_vld(interm_ptr, 176);
+  in27 = __lsx_vld(interm_ptr, 112);
+  in26 = __lsx_vld(interm_ptr, 128);
+
+  in16 = in20;
+  in17 = in21;
+  DUP2_ARG1(__lsx_vneg_h, in16, in17, tmp0, tmp1);
+  DOTP_CONST_PAIR(tmp0, in27, cospi_24_64, cospi_8_64, in20, in27);
+  DOTP_CONST_PAIR(tmp1, in26, cospi_24_64, cospi_8_64, in21, in26);
+
+  in22 = __lsx_vld(interm_ptr, 80);
+  in25 = __lsx_vld(interm_ptr, 96);
+  in24 = __lsx_vld(interm_ptr, 144);
+  in23 = __lsx_vld(interm_ptr, 160);
+
+  DUP4_ARG2(__lsx_vsub_h, in23, in20, in22, in21, in25, in26, in24, in27, in28,
+            in17, in18, in31);
+  DOTP_CONST_PAIR(in18, in17, cospi_12_64, cospi_20_64, in29, in30);
+  in16 = __lsx_vadd_h(in28, in29);
+  in19 = __lsx_vadd_h(in31, in30);
+  DOTP_CONST_PAIR(in19, in16, cospi_27_64, cospi_5_64, vec5, vec4);
+  __lsx_vst(vec5, out, 64);
+  __lsx_vst(vec4, out, 176);
+
+  DUP2_ARG2(__lsx_vsub_h, in28, in29, in31, in30, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_11_64, cospi_21_64, vec5, vec4);
+  __lsx_vst(vec5, out, 80);
+  __lsx_vst(vec4, out, 160);
+
+  DUP4_ARG2(__lsx_vadd_h, in22, in21, in23, in20, in24, in27, in25, in26, in16,
+            in29, in30, in19);
+  tmp0 = __lsx_vneg_h(in16);
+  DOTP_CONST_PAIR(tmp0, in19, cospi_12_64, cospi_20_64, in28, in31);
+  DUP2_ARG2(__lsx_vsub_h, in29, in28, in30, in31, in16, in19);
+  DOTP_CONST_PAIR(in19, in16, cospi_19_64, cospi_13_64, vec5, vec4);
+  __lsx_vst(vec5, out, 144);
+  __lsx_vst(vec4, out, 96);
+
+  DUP2_ARG2(__lsx_vadd_h, in29, in28, in30, in31, in17, in18);
+  DOTP_CONST_PAIR(in18, in17, cospi_3_64, cospi_29_64, vec5, vec4);
+  __lsx_vst(vec4, out, 112);
+  __lsx_vst(vec5, out, 128);
+}
+
+static void fdct32x8_1d_row_rd(int16_t *tmp_buf_big, int16_t *tmp_buf,
+                               int16_t *output) {
+  fdct8x32_1d_row_load_butterfly(tmp_buf_big, tmp_buf);
+  fdct8x32_1d_row_even_rd(tmp_buf, tmp_buf);
+  fdct8x32_1d_row_odd_rd((tmp_buf + 128), tmp_buf_big, (tmp_buf + 128));
+  fdct8x32_1d_row_transpose_store(tmp_buf, output);
+}
+
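+/* 32x32 forward DCT for the rate-distortion loop: the same column pass as
+   vpx_fdct32x32_lsx, with all four row slices routed through the _rd
+   helpers. */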
+void vpx_fdct32x32_rd_lsx(const int16_t *input, int16_t *out,
+                          int32_t src_stride) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);
+
+  /* column transform */
+  for (i = 0; i < 4; ++i) {
+    fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0],
+                       &tmp_buf_big[0] + (8 * i));
+  }
+  /* row transform */
+  for (i = 0; i < 4; ++i) {
+    fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0],
+                       out + (8 * i * 32));
+  }
+}
diff --git a/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c b/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c
new file mode 100644
index 0000000..508532b
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.c
@@ -0,0 +1,350 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
+
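+/* Transpose a 4x4 block of 16-bit elements; each input row sits in the low
+   64 bits of its vector. */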
+#define LSX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  do {                                                                         \
+    __m128i _s0, _s1, _s2, _s3, _t0, _t1, _t2, _t3;                            \
+                                                                               \
+    DUP2_ARG2(__lsx_vilvl_h, _in2, _in0, _in3, _in1, _s0, _s1);                \
+    DUP2_ARG2(__lsx_vilvh_h, _in2, _in0, _in3, _in1, _s2, _s3);                \
+    _t0 = __lsx_vilvl_h(_s1, _s0);                                             \
+    _t1 = __lsx_vilvh_h(_s1, _s0);                                             \
+    _t2 = __lsx_vilvl_h(_s3, _s2);                                             \
+    _t3 = __lsx_vilvh_h(_s3, _s2);                                             \
+    DUP2_ARG2(__lsx_vpickev_d, _t2, _t0, _t3, _t1, _out0, _out2);              \
+    DUP2_ARG2(__lsx_vpickod_d, _t2, _t0, _t3, _t1, _out1, _out3);              \
+  } while (0)
+
+#if !CONFIG_VP9_HIGHBITDEPTH
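+/* 1-D 16-point column FDCT over 8 columns: inputs are scaled by 4 (<< 2);
+   the even half goes through FDCT8x16_EVEN while the odd half is computed
+   inline with packed cosine constants (coeff/coeff1/coeff2). */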
+void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
+                        int32_t src_stride) {
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+  __m128i stp21, stp22, stp23, stp24, stp25, stp26, stp30;
+  __m128i stp31, stp32, stp33, stp34, stp35, stp36, stp37;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, cnst0, cnst1, cnst4, cnst5;
+  __m128i coeff = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df };
+  __m128i coeff1 = { 0x289a317906463fb1, 0x12943d3f1e2b3871 };
+  __m128i coeff2 = { 0xed6cd766c78fc04f, 0x0 };
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t src_stride6 = src_stride4 + src_stride2;
+  int32_t src_stride8 = src_stride4 << 1;
+  int16_t *input_tmp = (int16_t *)input;
+  in0 = __lsx_vld(input_tmp, 0);
+  DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
+            input_tmp, src_stride6, input_tmp, src_stride8, in1, in2, in3, in4);
+  input_tmp += src_stride4;
+  DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
+            input_tmp, src_stride6, input_tmp, src_stride8, in5, in6, in7, in8);
+  input_tmp += src_stride4;
+  DUP4_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4,
+            input_tmp, src_stride6, input_tmp, src_stride8, in9, in10, in11,
+            in12);
+  input_tmp += src_stride4;
+  DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in13,
+            in14);
+  input_tmp += src_stride2;
+  in15 = __lsx_vldx(input_tmp, src_stride2);
+
+  DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+  DUP4_ARG2(__lsx_vslli_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10,
+            in11);
+  DUP4_ARG2(__lsx_vslli_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14,
+            in15);
+  DUP4_ARG2(__lsx_vadd_h, in0, in15, in1, in14, in2, in13, in3, in12, tmp0,
+            tmp1, tmp2, tmp3);
+  DUP4_ARG2(__lsx_vadd_h, in4, in11, in5, in10, in6, in9, in7, in8, tmp4, tmp5,
+            tmp6, tmp7);
+  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
+                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+  __lsx_vst(tmp0, tmp_ptr, 0);
+  __lsx_vst(tmp1, tmp_ptr, 64);
+  __lsx_vst(tmp2, tmp_ptr, 128);
+  __lsx_vst(tmp3, tmp_ptr, 192);
+  __lsx_vst(tmp4, tmp_ptr, 256);
+  __lsx_vst(tmp5, tmp_ptr, 320);
+  __lsx_vst(tmp6, tmp_ptr, 384);
+  __lsx_vst(tmp7, tmp_ptr, 448);
+  DUP4_ARG2(__lsx_vsub_h, in0, in15, in1, in14, in2, in13, in3, in12, in15,
+            in14, in13, in12);
+  DUP4_ARG2(__lsx_vsub_h, in4, in11, in5, in10, in6, in9, in7, in8, in11, in10,
+            in9, in8);
+
+  tmp_ptr += 16;
+
+  /* stp 1 */
+  DUP2_ARG2(__lsx_vilvh_h, in10, in13, in11, in12, vec2, vec4);
+  DUP2_ARG2(__lsx_vilvl_h, in10, in13, in11, in12, vec3, vec5);
+
+  cnst4 = __lsx_vreplvei_h(coeff, 0);
+  DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst4, stp25);
+
+  cnst5 = __lsx_vreplvei_h(coeff, 1);
+  cnst5 = __lsx_vpackev_h(cnst5, cnst4);
+  DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst5, stp22);
+  DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst4, stp24);
+  DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst5, stp23);
+
+  /* stp2 */
+  LSX_BUTTERFLY_4_H(in8, in9, stp22, stp23, stp30, stp31, stp32, stp33);
+  LSX_BUTTERFLY_4_H(in15, in14, stp25, stp24, stp37, stp36, stp35, stp34);
+  DUP2_ARG2(__lsx_vilvh_h, stp36, stp31, stp35, stp32, vec2, vec4);
+  DUP2_ARG2(__lsx_vilvl_h, stp36, stp31, stp35, stp32, vec3, vec5);
+  DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 3, cnst0, cnst1);
+  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+  DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst0, stp26);
+
+  cnst0 = __lsx_vreplvei_h(coeff, 4);
+  cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+  DOT_SHIFT_RIGHT_PCK_H(vec2, vec3, cnst1, stp21);
+
+  LSX_BUTTERFLY_4_H(stp30, stp37, stp26, stp21, in8, in15, in14, in9);
+  vec1 = __lsx_vilvl_h(in15, in8);
+  vec0 = __lsx_vilvh_h(in15, in8);
+
+  DUP2_ARG2(__lsx_vreplvei_h, coeff1, 0, coeff1, 1, cnst0, cnst1);
+  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+  __lsx_vst(in8, tmp_ptr, 0);
+
+  cnst0 = __lsx_vreplvei_h(coeff2, 0);
+  cnst0 = __lsx_vpackev_h(cnst1, cnst0);
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+  __lsx_vst(in8, tmp_ptr, 448);
+
+  vec1 = __lsx_vilvl_h(in14, in9);
+  vec0 = __lsx_vilvh_h(in14, in9);
+  DUP2_ARG2(__lsx_vreplvei_h, coeff1, 2, coeff1, 3, cnst0, cnst1);
+  cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8);
+  __lsx_vst(in8, tmp_ptr, 256);
+
+  cnst1 = __lsx_vreplvei_h(coeff2, 2);
+  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+  __lsx_vst(in8, tmp_ptr, 192);
+
+  DUP2_ARG2(__lsx_vreplvei_h, coeff, 2, coeff, 5, cnst0, cnst1);
+  cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+  DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp25);
+
+  cnst1 = __lsx_vreplvei_h(coeff, 3);
+  cnst1 = __lsx_vpackev_h(cnst0, cnst1);
+  DOT_SHIFT_RIGHT_PCK_H(vec4, vec5, cnst1, stp22);
+
+  /* stp4 */
+  DUP2_ARG2(__lsx_vadd_h, stp34, stp25, stp33, stp22, in13, in10);
+
+  vec1 = __lsx_vilvl_h(in13, in10);
+  vec0 = __lsx_vilvh_h(in13, in10);
+  DUP2_ARG2(__lsx_vreplvei_h, coeff1, 4, coeff1, 5, cnst0, cnst1);
+  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+  __lsx_vst(in8, tmp_ptr, 128);
+
+  cnst0 = __lsx_vreplvei_h(coeff2, 1);
+  cnst0 = __lsx_vpackev_h(cnst1, cnst0);
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+  __lsx_vst(in8, tmp_ptr, 320);
+
+  DUP2_ARG2(__lsx_vsub_h, stp34, stp25, stp33, stp22, in12, in11);
+  vec1 = __lsx_vilvl_h(in12, in11);
+  vec0 = __lsx_vilvh_h(in12, in11);
+  DUP2_ARG2(__lsx_vreplvei_h, coeff1, 6, coeff1, 7, cnst0, cnst1);
+  cnst1 = __lsx_vpackev_h(cnst1, cnst0);
+
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst1, in8);
+  __lsx_vst(in8, tmp_ptr, 384);
+
+  cnst1 = __lsx_vreplvei_h(coeff2, 3);
+  cnst0 = __lsx_vpackev_h(cnst0, cnst1);
+  DOT_SHIFT_RIGHT_PCK_H(vec0, vec1, cnst0, in8);
+  __lsx_vst(in8, tmp_ptr, 64);
+}
+
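+/* 1-D 16-point row FDCT over 8 rows: transpose, round each input
+   ((x + 1) >> 2), butterfly, then the even/odd halves and a transposed
+   store. */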
+void fdct16x8_1d_row(int16_t *input, int16_t *output) {
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  __m128i in8, in9, in10, in11, in12, in13, in14, in15;
+  int16_t *input_tmp = input;
+
+  DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in0, in1, in2,
+            in3);
+  DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in4, in5,
+            in6, in7);
+  DUP4_ARG2(__lsx_vld, input_tmp, 16, input_tmp, 48, input_tmp, 80, input_tmp,
+            112, in8, in9, in10, in11);
+  DUP4_ARG2(__lsx_vld, input_tmp, 144, input_tmp, 176, input_tmp, 208,
+            input_tmp, 240, in12, in13, in14, in15);
+
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+  LSX_TRANSPOSE8x8_H(in8, in9, in10, in11, in12, in13, in14, in15, in8, in9,
+                     in10, in11, in12, in13, in14, in15);
+  DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vaddi_hu, in4, 1, in5, 1, in6, 1, in7, 1, in4, in5, in6, in7);
+  DUP4_ARG2(__lsx_vaddi_hu, in8, 1, in9, 1, in10, 1, in11, 1, in8, in9, in10,
+            in11);
+  DUP4_ARG2(__lsx_vaddi_hu, in12, 1, in13, 1, in14, 1, in15, 1, in12, in13,
+            in14, in15);
+
+  DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vsrai_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+  DUP4_ARG2(__lsx_vsrai_h, in8, 2, in9, 2, in10, 2, in11, 2, in8, in9, in10,
+            in11);
+  DUP4_ARG2(__lsx_vsrai_h, in12, 2, in13, 2, in14, 2, in15, 2, in12, in13, in14,
+            in15);
+  LSX_BUTTERFLY_16_H(in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10,
+                     in11, in12, in13, in14, in15, tmp0, tmp1, tmp2, tmp3, tmp4,
+                     tmp5, tmp6, tmp7, in8, in9, in10, in11, in12, in13, in14,
+                     in15);
+  __lsx_vst(in8, input, 0);
+  __lsx_vst(in9, input, 32);
+  __lsx_vst(in10, input, 64);
+  __lsx_vst(in11, input, 96);
+  __lsx_vst(in12, input, 128);
+  __lsx_vst(in13, input, 160);
+  __lsx_vst(in14, input, 192);
+  __lsx_vst(in15, input, 224);
+
+  FDCT8x16_EVEN(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp0, tmp1,
+                tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
+  DUP4_ARG2(__lsx_vld, input, 0, input, 32, input, 64, input, 96, in8, in9,
+            in10, in11);
+  DUP4_ARG2(__lsx_vld, input, 128, input, 160, input, 192, input, 224, in12,
+            in13, in14, in15);
+  FDCT8x16_ODD(in8, in9, in10, in11, in12, in13, in14, in15, in0, in1, in2, in3,
+               in4, in5, in6, in7);
+  LSX_TRANSPOSE8x8_H(tmp0, in0, tmp1, in1, tmp2, in2, tmp3, in3, tmp0, in0,
+                     tmp1, in1, tmp2, in2, tmp3, in3);
+  __lsx_vst(tmp0, output, 0);
+  __lsx_vst(in0, output, 32);
+  __lsx_vst(tmp1, output, 64);
+  __lsx_vst(in1, output, 96);
+  __lsx_vst(tmp2, output, 128);
+  __lsx_vst(in2, output, 160);
+  __lsx_vst(tmp3, output, 192);
+  __lsx_vst(in3, output, 224);
+
+  LSX_TRANSPOSE8x8_H(tmp4, in4, tmp5, in5, tmp6, in6, tmp7, in7, tmp4, in4,
+                     tmp5, in5, tmp6, in6, tmp7, in7);
+  __lsx_vst(tmp4, output, 16);
+  __lsx_vst(in4, output, 48);
+  __lsx_vst(tmp5, output, 80);
+  __lsx_vst(in5, output, 112);
+  __lsx_vst(tmp6, output, 144);
+  __lsx_vst(in6, output, 176);
+  __lsx_vst(tmp7, output, 208);
+  __lsx_vst(in7, output, 240);
+}
+
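+/* 4x4 forward DCT: inputs scaled by 16 (<< 4), two 1-D passes with a
+   transpose after each, then ((x + 1) >> 2) rounding and a packed store of
+   two rows per vector. */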
+void vpx_fdct4x4_lsx(const int16_t *input, int16_t *output,
+                     int32_t src_stride) {
+  __m128i in0, in1, in2, in3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t src_stride6 = src_stride4 + src_stride2;
+
+  in0 = __lsx_vld(input, 0);
+  DUP2_ARG2(__lsx_vldx, input, src_stride2, input, src_stride4, in1, in2);
+  in3 = __lsx_vldx(input, src_stride6);
+
+  /* fdct4 pre-process: as in the C reference, add 1 to input[0] when it is
+     nonzero (after the << 4 scaling) */
+  {
+    __m128i vec, mask;
+    __m128i zero = __lsx_vldi(0);
+
+    mask = __lsx_vinsgr2vr_b(zero, 1, 0);
+    DUP4_ARG2(__lsx_vslli_h, in0, 4, in1, 4, in2, 4, in3, 4, in0, in1, in2,
+              in3);
+    vec = __lsx_vseqi_h(in0, 0);
+    vec = __lsx_vxori_b(vec, 255);
+    vec = __lsx_vand_v(mask, vec);
+    in0 = __lsx_vadd_h(in0, vec);
+  }
+
+  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+  VP9_FDCT4(in0, in1, in2, in3, in0, in1, in2, in3);
+  LSX_TRANSPOSE4x4_H(in0, in1, in2, in3, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vaddi_hu, in0, 1, in1, 1, in2, 1, in3, 1, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vsrai_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+  DUP2_ARG2(__lsx_vpickev_d, in1, in0, in3, in2, in0, in2);
+  __lsx_vst(in0, output, 0);
+  __lsx_vst(in2, output, 16);
+}
+
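+/* 8x8 forward DCT: inputs scaled by 4 (<< 2), two 1-D passes with a
+   transpose after each, then a final divide by 2 rounding toward zero
+   (SRLI_AVE_S_4V_H). */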
+void vpx_fdct8x8_lsx(const int16_t *input, int16_t *output,
+                     int32_t src_stride) {
+  __m128i in0, in1, in2, in3, in4, in5, in6, in7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t src_stride6 = src_stride4 + src_stride2;
+  int16_t *input_tmp = (int16_t *)input;
+
+  in0 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in1,
+            in2);
+  in3 = __lsx_vldx(input_tmp, src_stride6);
+  input_tmp += src_stride4;
+  in4 = __lsx_vld(input_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, input_tmp, src_stride2, input_tmp, src_stride4, in5,
+            in6);
+  in7 = __lsx_vldx(input_tmp, src_stride6);
+
+  DUP4_ARG2(__lsx_vslli_h, in0, 2, in1, 2, in2, 2, in3, 2, in0, in1, in2, in3);
+  DUP4_ARG2(__lsx_vslli_h, in4, 2, in5, 2, in6, 2, in7, 2, in4, in5, in6, in7);
+
+  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+            in5, in6, in7);
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+  VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3, in4,
+            in5, in6, in7);
+  LSX_TRANSPOSE8x8_H(in0, in1, in2, in3, in4, in5, in6, in7, in0, in1, in2, in3,
+                     in4, in5, in6, in7);
+  SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7);
+
+  __lsx_vst(in0, output, 0);
+  __lsx_vst(in1, output, 16);
+  __lsx_vst(in2, output, 32);
+  __lsx_vst(in3, output, 48);
+  __lsx_vst(in4, output, 64);
+  __lsx_vst(in5, output, 80);
+  __lsx_vst(in6, output, 96);
+  __lsx_vst(in7, output, 112);
+}
+
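+/* 16x16 forward DCT: two 8-column column passes into tmp_buf, then two 8-row
+   row passes. */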
+void vpx_fdct16x16_lsx(const int16_t *input, int16_t *output,
+                       int32_t src_stride) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]);
+
+  /* column transform */
+  for (i = 0; i < 2; ++i) {
+    fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride);
+  }
+
+  /* row transform */
+  for (i = 0; i < 2; ++i) {
+    fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i)));
+  }
+}
+#endif  // !CONFIG_VP9_HIGHBITDEPTH
diff --git a/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h b/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h
new file mode 100644
index 0000000..4a9fce9
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/fwd_txfm_lsx.h
@@ -0,0 +1,381 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_
+
+#include "vpx_dsp/loongarch/txfm_macros_lsx.h"
+#include "vpx_dsp/txfm_common.h"
+
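+/* 4-point 1-D FDCT: a butterfly followed by two dot-product rotations with
+   packed cospi constants, narrowed with rounding by DCT_CONST_BITS. */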
+#define VP9_FDCT4(in0, in1, in2, in3, out0, out1, out2, out3)                 \
+  do {                                                                        \
+    __m128i cnst0_m, cnst1_m, cnst2_m, cnst3_m;                               \
+    __m128i vec0_m, vec1_m, vec2_m, vec3_m;                                   \
+    __m128i vec4_m, vec5_m, vec6_m, vec7_m;                                   \
+    __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x000000000000c4df };             \
+                                                                              \
+    LSX_BUTTERFLY_4_H(in0, in1, in2, in3, vec0_m, vec1_m, vec2_m, vec3_m);    \
+    DUP2_ARG2(__lsx_vilvl_h, vec1_m, vec0_m, vec3_m, vec2_m, vec0_m, vec2_m); \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, cnst0_m, cnst1_m);    \
+    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
+    vec5_m = __lsx_vdp2_w_h(vec0_m, cnst1_m);                                 \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 3, cnst2_m, cnst3_m);    \
+    cnst2_m = __lsx_vpackev_h(cnst3_m, cnst2_m);                              \
+    vec7_m = __lsx_vdp2_w_h(vec2_m, cnst2_m);                                 \
+                                                                              \
+    vec4_m = __lsx_vdp2_w_h(vec0_m, cnst0_m);                                 \
+    cnst2_m = __lsx_vreplvei_h(coeff_m, 2);                                   \
+    cnst2_m = __lsx_vpackev_h(cnst2_m, cnst3_m);                              \
+    vec6_m = __lsx_vdp2_w_h(vec2_m, cnst2_m);                                 \
+                                                                              \
+    DUP4_ARG3(__lsx_vssrarni_h_w, vec4_m, vec4_m, DCT_CONST_BITS, vec5_m,     \
+              vec5_m, DCT_CONST_BITS, vec6_m, vec6_m, DCT_CONST_BITS, vec7_m, \
+              vec7_m, DCT_CONST_BITS, out0, out2, out1, out3);                \
+  } while (0)
+
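+/* 8-point 1-D FDCT in four stages; the even outputs (0/2/4/6) come from the
+   stage-1 rotations and the odd outputs from stages 2-4. */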
+#define VP9_FDCT8(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, out2, \
+                  out3, out4, out5, out6, out7)                             \
+  do {                                                                      \
+    __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m;                       \
+    __m128i s7_m, x0_m, x1_m, x2_m, x3_m;                                   \
+    __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 };           \
+                                                                            \
+    /* FDCT stage1 */                                                       \
+    LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m,   \
+                      s2_m, s3_m, s4_m, s5_m, s6_m, s7_m);                  \
+    LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m);      \
+    DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m);           \
+    DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m);           \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m);        \
+    x1_m = __lsx_vpackev_h(x1_m, x0_m);                                     \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4);                          \
+                                                                            \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m);        \
+    x2_m = __lsx_vneg_h(x2_m);                                              \
+    x2_m = __lsx_vpackev_h(x3_m, x2_m);                                     \
+    DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6);                          \
+                                                                            \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0);                          \
+    x2_m = __lsx_vreplvei_h(coeff_m, 2);                                    \
+    x2_m = __lsx_vpackev_h(x2_m, x3_m);                                     \
+    DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2);                          \
+                                                                            \
+    /* stage2 */                                                            \
+    s1_m = __lsx_vilvl_h(s5_m, s6_m);                                       \
+    s0_m = __lsx_vilvh_h(s5_m, s6_m);                                       \
+                                                                            \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m);                          \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m);                          \
+                                                                            \
+    /* stage3 */                                                            \
+    LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m);      \
+                                                                            \
+    /* stage4 */                                                            \
+    DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m);           \
+    DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m);           \
+                                                                            \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m);        \
+    x1_m = __lsx_vpackev_h(x0_m, x1_m);                                     \
+    DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1);                          \
+                                                                            \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m);        \
+    x2_m = __lsx_vpackev_h(x3_m, x2_m);                                     \
+    DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5);                          \
+                                                                            \
+    x1_m = __lsx_vreplvei_h(coeff_m, 5);                                    \
+    x0_m = __lsx_vneg_h(x0_m);                                              \
+    x0_m = __lsx_vpackev_h(x1_m, x0_m);                                     \
+    DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7);                          \
+    x2_m = __lsx_vreplvei_h(coeff_m, 6);                                    \
+    x3_m = __lsx_vneg_h(x3_m);                                              \
+    x2_m = __lsx_vpackev_h(x2_m, x3_m);                                     \
+    DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3);                          \
+  } while (0)
+
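+/* Final fdct8x8 scaling: (in + (in < 0)) >> 1 per 16-bit lane, i.e. divide
+   by 2 rounding toward zero, done as an average of each lane with its own
+   sign bit. */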
+#define SRLI_AVE_S_4V_H(in0, in1, in2, in3, in4, in5, in6, in7)             \
+  do {                                                                      \
+    __m128i vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
+                                                                            \
+    DUP4_ARG2(__lsx_vsrli_h, in0, 15, in1, 15, in2, 15, in3, 15, vec0_m,    \
+              vec1_m, vec2_m, vec3_m);                                      \
+    DUP4_ARG2(__lsx_vsrli_h, in4, 15, in5, 15, in6, 15, in7, 15, vec4_m,    \
+              vec5_m, vec6_m, vec7_m);                                      \
+    DUP4_ARG2(__lsx_vavg_h, vec0_m, in0, vec1_m, in1, vec2_m, in2, vec3_m,  \
+              in3, in0, in1, in2, in3);                                     \
+    DUP4_ARG2(__lsx_vavg_h, vec4_m, in4, vec5_m, in5, vec6_m, in6, vec7_m,  \
+              in7, in4, in5, in6, in7);                                     \
+  } while (0)
+
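+/* fdct32 rounding: (vec + 1 + (vec > 0)) >> 2 on each 16-bit lane; the
+   "POS" form adds the extra 1 for positive values. */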
+#define FDCT32_POSTPROC_2V_POS_H(vec0, vec1) \
+  do {                                       \
+    __m128i tp0_m, tp1_m;                    \
+    __m128i one = __lsx_vreplgr2vr_h(1);     \
+                                             \
+    tp0_m = __lsx_vslei_h(vec0, 0);          \
+    tp1_m = __lsx_vslei_h(vec1, 0);          \
+    tp0_m = __lsx_vxori_b(tp0_m, 255);       \
+    tp1_m = __lsx_vxori_b(tp1_m, 255);       \
+    vec0 = __lsx_vadd_h(vec0, one);          \
+    vec1 = __lsx_vadd_h(vec1, one);          \
+    tp0_m = __lsx_vand_v(one, tp0_m);        \
+    tp1_m = __lsx_vand_v(one, tp1_m);        \
+    vec0 = __lsx_vadd_h(vec0, tp0_m);        \
+    vec1 = __lsx_vadd_h(vec1, tp1_m);        \
+    vec0 = __lsx_vsrai_h(vec0, 2);           \
+    vec1 = __lsx_vsrai_h(vec1, 2);           \
+  } while (0)
+
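+/* (vec + 1 + (vec < 0)) >> 2 on each 16-bit lane; __lsx_vldi(0x401)
+   replicates the halfword constant 1. */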
+#define FDCT_POSTPROC_2V_NEG_H(vec0, vec1) \
+  do {                                     \
+    __m128i tp0_m, tp1_m;                  \
+    __m128i one_m = __lsx_vldi(0x401);     \
+                                           \
+    tp0_m = __lsx_vslti_h(vec0, 0);        \
+    tp1_m = __lsx_vslti_h(vec1, 0);        \
+    vec0 = __lsx_vadd_h(vec0, one_m);      \
+    vec1 = __lsx_vadd_h(vec1, one_m);      \
+    tp0_m = __lsx_vand_v(one_m, tp0_m);    \
+    tp1_m = __lsx_vand_v(one_m, tp1_m);    \
+    vec0 = __lsx_vadd_h(vec0, tp0_m);      \
+    vec1 = __lsx_vadd_h(vec1, tp1_m);      \
+    vec0 = __lsx_vsrai_h(vec0, 2);         \
+    vec1 = __lsx_vsrai_h(vec1, 2);         \
+  } while (0)
+
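+/* 32-bit variant of the rounding above: (vec + 1 + (vec < 0)) >> 2 on each
+   word lane. */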
+#define FDCT32_POSTPROC_NEG_W(vec)         \
+  do {                                     \
+    __m128i temp_m;                        \
+    __m128i one_m = __lsx_vreplgr2vr_w(1); \
+                                           \
+    temp_m = __lsx_vslti_w(vec, 0);        \
+    vec = __lsx_vadd_w(vec, one_m);        \
+    temp_m = __lsx_vand_v(one_m, temp_m);  \
+    vec = __lsx_vadd_w(vec, temp_m);       \
+    vec = __lsx_vsrai_w(vec, 2);           \
+  } while (0)
+
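+/* 32-bit butterfly rotation over left/right halves: out0 (left) and out1
+   (right) hold reg0 * const0 - reg1 * const1, out2/out3 hold
+   reg1 * const0 + reg0 * const1, each narrowed with rounding by
+   DCT_CONST_BITS. */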
+#define DOTP_CONST_PAIR_W(reg0_left, reg1_left, reg0_right, reg1_right,       \
+                          const0, const1, out0, out1, out2, out3)             \
+  do {                                                                        \
+    __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m;                   \
+    __m128i tp0_m, tp1_m, tp2_m, tp3_m, _tmp0, _tmp1;                         \
+    __m128i k0_m = __lsx_vreplgr2vr_w((int32_t)const0);                       \
+                                                                              \
+    s0_m = __lsx_vreplgr2vr_w((int32_t)const1);                               \
+    k0_m = __lsx_vpackev_w(s0_m, k0_m);                                       \
+                                                                              \
+    DUP2_ARG1(__lsx_vneg_w, reg1_left, reg1_right, _tmp0, _tmp1);             \
+    s1_m = __lsx_vilvl_w(_tmp0, reg0_left);                                   \
+    s0_m = __lsx_vilvh_w(_tmp0, reg0_left);                                   \
+    s3_m = __lsx_vilvl_w(reg0_left, reg1_left);                               \
+    s2_m = __lsx_vilvh_w(reg0_left, reg1_left);                               \
+    s5_m = __lsx_vilvl_w(_tmp1, reg0_right);                                  \
+    s4_m = __lsx_vilvh_w(_tmp1, reg0_right);                                  \
+    s7_m = __lsx_vilvl_w(reg0_right, reg1_right);                             \
+    s6_m = __lsx_vilvh_w(reg0_right, reg1_right);                             \
+    DUP2_ARG2(__lsx_vdp2_d_w, s0_m, k0_m, s1_m, k0_m, tp0_m, tp1_m);          \
+    DUP2_ARG2(__lsx_vdp2_d_w, s4_m, k0_m, s5_m, k0_m, tp2_m, tp3_m);          \
+    DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \
+              DCT_CONST_BITS, out0, out1);                                    \
+    DUP2_ARG2(__lsx_vdp2_d_w, s2_m, k0_m, s3_m, k0_m, tp0_m, tp1_m);          \
+    DUP2_ARG2(__lsx_vdp2_d_w, s6_m, k0_m, s7_m, k0_m, tp2_m, tp3_m);          \
+    DUP2_ARG3(__lsx_vssrarni_w_d, tp0_m, tp1_m, DCT_CONST_BITS, tp2_m, tp3_m, \
+              DCT_CONST_BITS, out2, out3);                                    \
+  } while (0)
+
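+/*
+ * Reconstruction helper: loads four rows of eight pixels from dst, widens
+ * them to 16 bits, adds the residue vectors in0..in3, saturates back to
+ * unsigned 8 bits and stores the four rows in place.
+ */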
+#define VP9_ADDBLK_ST8x4_UB(dst, _stride, _stride2, _stride3, in0, in1, in2,   \
+                            in3)                                               \
+  do {                                                                         \
+    __m128i dst0_m, dst1_m, dst2_m, dst3_m;                                    \
+    __m128i tmp0_m, tmp1_m;                                                    \
+    __m128i res0_m, res1_m, res2_m, res3_m;                                    \
+                                                                               \
+    dst0_m = __lsx_vld(dst, 0);                                                \
+    DUP2_ARG2(__lsx_vldx, dst, _stride, dst, _stride2, dst1_m, dst2_m);        \
+    dst3_m = __lsx_vldx(dst, _stride3);                                        \
+    DUP4_ARG2(__lsx_vsllwil_hu_bu, dst0_m, 0, dst1_m, 0, dst2_m, 0, dst3_m, 0, \
+              res0_m, res1_m, res2_m, res3_m);                                 \
+    DUP4_ARG2(__lsx_vadd_h, res0_m, in0, res1_m, in1, res2_m, in2, res3_m,     \
+              in3, res0_m, res1_m, res2_m, res3_m);                            \
+    DUP2_ARG3(__lsx_vssrarni_bu_h, res1_m, res0_m, 0, res3_m, res2_m, 0,       \
+              tmp0_m, tmp1_m);                                                 \
+    __lsx_vstelm_d(tmp0_m, dst, 0, 0);                                         \
+    __lsx_vstelm_d(tmp0_m, dst + _stride, 0, 1);                               \
+    __lsx_vstelm_d(tmp1_m, dst + _stride2, 0, 0);                              \
+    __lsx_vstelm_d(tmp1_m, dst + _stride3, 0, 1);                              \
+  } while (0)
+
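+/*
+ * Even half of the 16-point forward DCT: produces the eight even-indexed
+ * coefficients of the column transform. The halfwords packed into
+ * coeff_m are the usual cospi_*_64 constants, e.g. 0x2d41 = 11585 =
+ * cospi_16_64 and 0x3b21 = 15137 = cospi_8_64.
+ */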
+#define FDCT8x16_EVEN(in0, in1, in2, in3, in4, in5, in6, in7, out0, out1, \
+                      out2, out3, out4, out5, out6, out7)                 \
+  do {                                                                    \
+    __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m, s6_m, s7_m;               \
+    __m128i x0_m, x1_m, x2_m, x3_m;                                       \
+    __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e35370c7c3ec5 };         \
+                                                                          \
+    /* FDCT stage 1 */                                                    \
+    LSX_BUTTERFLY_8_H(in0, in1, in2, in3, in4, in5, in6, in7, s0_m, s1_m, \
+                      s2_m, s3_m, s4_m, s5_m, s6_m, s7_m);                \
+    LSX_BUTTERFLY_4_H(s0_m, s1_m, s2_m, s3_m, x0_m, x1_m, x2_m, x3_m);    \
+    DUP2_ARG2(__lsx_vilvh_h, x1_m, x0_m, x3_m, x2_m, s0_m, s2_m);         \
+    DUP2_ARG2(__lsx_vilvl_h, x1_m, x0_m, x3_m, x2_m, s1_m, s3_m);         \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 0, coeff_m, 1, x0_m, x1_m);      \
+    x1_m = __lsx_vpackev_h(x1_m, x0_m);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, out4);                        \
+                                                                          \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, x2_m, x3_m);      \
+    x2_m = __lsx_vneg_h(x2_m);                                            \
+    x2_m = __lsx_vpackev_h(x3_m, x2_m);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out6);                        \
+                                                                          \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, out0);                        \
+    x2_m = __lsx_vreplvei_h(coeff_m, 2);                                  \
+    x2_m = __lsx_vpackev_h(x2_m, x3_m);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(s2_m, s3_m, x2_m, out2);                        \
+                                                                          \
+    /* stage 2 */                                                         \
+    s1_m = __lsx_vilvl_h(s5_m, s6_m);                                     \
+    s0_m = __lsx_vilvh_h(s5_m, s6_m);                                     \
+                                                                          \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x0_m, s6_m);                        \
+    DOT_SHIFT_RIGHT_PCK_H(s0_m, s1_m, x1_m, s5_m);                        \
+                                                                          \
+    /* stage 3 */                                                         \
+    LSX_BUTTERFLY_4_H(s4_m, s7_m, s6_m, s5_m, x0_m, x3_m, x2_m, x1_m);    \
+                                                                          \
+    /* stage 4 */                                                         \
+    DUP2_ARG2(__lsx_vilvh_h, x3_m, x0_m, x2_m, x1_m, s4_m, s6_m);         \
+    DUP2_ARG2(__lsx_vilvl_h, x3_m, x0_m, x2_m, x1_m, s5_m, s7_m);         \
+                                                                          \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 4, coeff_m, 5, x0_m, x1_m);      \
+    x1_m = __lsx_vpackev_h(x0_m, x1_m);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x1_m, out1);                        \
+                                                                          \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 6, coeff_m, 7, x2_m, x3_m);      \
+    x2_m = __lsx_vpackev_h(x3_m, x2_m);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out5);                        \
+                                                                          \
+    x1_m = __lsx_vreplvei_h(coeff_m, 5);                                  \
+    x0_m = __lsx_vneg_h(x0_m);                                            \
+    x0_m = __lsx_vpackev_h(x1_m, x0_m);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(s4_m, s5_m, x0_m, out7);                        \
+                                                                          \
+    x2_m = __lsx_vreplvei_h(coeff_m, 6);                                  \
+    x3_m = __lsx_vneg_h(x3_m);                                            \
+    x2_m = __lsx_vpackev_h(x2_m, x3_m);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(s6_m, s7_m, x2_m, out3);                        \
+  } while (0)
+
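+/*
+ * Odd half of the 16-point forward DCT: produces the eight odd-indexed
+ * coefficients out1..out15. coeff2_m packs negated constants (e.g.
+ * 0xc04f = -16305 = -cospi_2_64) so the difference terms can reuse the
+ * same dot-product helper.
+ */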
+#define FDCT8x16_ODD(input0, input1, input2, input3, input4, input5, input6,  \
+                     input7, out1, out3, out5, out7, out9, out11, out13,      \
+                     out15)                                                   \
+  do {                                                                        \
+    __m128i stp21_m, stp22_m, stp23_m, stp24_m, stp25_m, stp26_m;             \
+    __m128i stp30_m, stp31_m, stp32_m, stp33_m, stp34_m, stp35_m;             \
+    __m128i stp36_m, stp37_m, vec0_m, vec1_m;                                 \
+    __m128i vec2_m, vec3_m, vec4_m, vec5_m, vec6_m;                           \
+    __m128i cnst0_m, cnst1_m, cnst4_m, cnst5_m;                               \
+    __m128i coeff_m = { 0x187e3b21d2bf2d41, 0x238e3537e782c4df };             \
+    __m128i coeff1_m = { 0x289a317906463fb1, 0x12943d3f1e2b3871 };            \
+    __m128i coeff2_m = { 0xed6cd766c78fc04f, 0x0 };                           \
+                                                                              \
+    /* stage 1 */                                                             \
+    DUP2_ARG2(__lsx_vilvh_h, input2, input5, input3, input4, vec2_m, vec4_m); \
+    DUP2_ARG2(__lsx_vilvl_h, input2, input5, input3, input4, vec3_m, vec5_m); \
+                                                                              \
+    cnst4_m = __lsx_vreplvei_h(coeff_m, 0);                                   \
+    DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst4_m, stp25_m);                  \
+                                                                              \
+    cnst5_m = __lsx_vreplvei_h(coeff_m, 1);                                   \
+    cnst5_m = __lsx_vpackev_h(cnst5_m, cnst4_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst5_m, stp22_m);                  \
+    DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst4_m, stp24_m);                  \
+    DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst5_m, stp23_m);                  \
+                                                                              \
+    /* stage 2 */                                                             \
+    LSX_BUTTERFLY_4_H(input0, input1, stp22_m, stp23_m, stp30_m, stp31_m,     \
+                      stp32_m, stp33_m);                                      \
+    LSX_BUTTERFLY_4_H(input7, input6, stp25_m, stp24_m, stp37_m, stp36_m,     \
+                      stp35_m, stp34_m);                                      \
+                                                                              \
+    DUP2_ARG2(__lsx_vilvh_h, stp36_m, stp31_m, stp35_m, stp32_m, vec2_m,      \
+              vec4_m);                                                        \
+    DUP2_ARG2(__lsx_vilvl_h, stp36_m, stp31_m, stp35_m, stp32_m, vec3_m,      \
+              vec5_m);                                                        \
+                                                                              \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 2, coeff_m, 3, cnst0_m, cnst1_m);    \
+    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst0_m, stp26_m);                  \
+                                                                              \
+    cnst0_m = __lsx_vreplvei_h(coeff_m, 4);                                   \
+    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec2_m, vec3_m, cnst1_m, stp21_m);                  \
+                                                                              \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff_m, 5, coeff_m, 2, cnst0_m, cnst1_m);    \
+    cnst1_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp25_m);                  \
+                                                                              \
+    cnst0_m = __lsx_vreplvei_h(coeff_m, 3);                                   \
+    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec4_m, vec5_m, cnst1_m, stp22_m);                  \
+                                                                              \
+    /* stage 4 */                                                             \
+    LSX_BUTTERFLY_4_H(stp30_m, stp37_m, stp26_m, stp21_m, vec6_m, vec2_m,     \
+                      vec4_m, vec5_m);                                        \
+    LSX_BUTTERFLY_4_H(stp33_m, stp34_m, stp25_m, stp22_m, stp21_m, stp23_m,   \
+                      stp24_m, stp31_m);                                      \
+                                                                              \
+    vec1_m = __lsx_vilvl_h(vec2_m, vec6_m);                                   \
+    vec0_m = __lsx_vilvh_h(vec2_m, vec6_m);                                   \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 0, coeff1_m, 1, cnst0_m, cnst1_m);  \
+    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
+                                                                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out1);                     \
+                                                                              \
+    cnst0_m = __lsx_vreplvei_h(coeff2_m, 0);                                  \
+    cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out15);                    \
+                                                                              \
+    vec1_m = __lsx_vilvl_h(vec4_m, vec5_m);                                   \
+    vec0_m = __lsx_vilvh_h(vec4_m, vec5_m);                                   \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 2, coeff1_m, 3, cnst0_m, cnst1_m);  \
+    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
+                                                                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out9);                     \
+                                                                              \
+    cnst1_m = __lsx_vreplvei_h(coeff2_m, 2);                                  \
+    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out7);                     \
+                                                                              \
+    vec1_m = __lsx_vilvl_h(stp23_m, stp21_m);                                 \
+    vec0_m = __lsx_vilvh_h(stp23_m, stp21_m);                                 \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 4, coeff1_m, 5, cnst0_m, cnst1_m);  \
+    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out5);                     \
+                                                                              \
+    cnst0_m = __lsx_vreplvei_h(coeff2_m, 1);                                  \
+    cnst0_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out11);                    \
+                                                                              \
+    vec1_m = __lsx_vilvl_h(stp24_m, stp31_m);                                 \
+    vec0_m = __lsx_vilvh_h(stp24_m, stp31_m);                                 \
+    DUP2_ARG2(__lsx_vreplvei_h, coeff1_m, 6, coeff1_m, 7, cnst0_m, cnst1_m);  \
+    cnst1_m = __lsx_vpackev_h(cnst1_m, cnst0_m);                              \
+                                                                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst1_m, out13);                    \
+                                                                              \
+    cnst1_m = __lsx_vreplvei_h(coeff2_m, 3);                                  \
+    cnst0_m = __lsx_vpackev_h(cnst0_m, cnst1_m);                              \
+    DOT_SHIFT_RIGHT_PCK_H(vec0_m, vec1_m, cnst0_m, out3);                     \
+  } while (0)
+
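+/* Shared 1-D passes of the 16x16 forward transform; the column pass works
+ * on an 8-column slice and the row pass on a 16x8 block. */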
+void fdct8x16_1d_column(const int16_t *input, int16_t *tmp_ptr,
+                        int32_t src_stride);
+void fdct16x8_1d_row(int16_t *input, int16_t *output);
+#endif  // VPX_VPX_DSP_LOONGARCH_FWD_TXFM_LSX_H_
diff --git a/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c b/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c
new file mode 100644
index 0000000..ec07f57
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/idct32x32_lsx.c
@@ -0,0 +1,834 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/fwd_txfm_lsx.h"
+
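+/* Unpack 16 unsigned bytes into two vectors of eight unsigned halfwords:
+ * the low half in _out0 and the high half in _out1. */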
+#define UNPCK_UB_SH(_in, _out0, _out1)   \
+  do {                                   \
+    _out0 = __lsx_vsllwil_hu_bu(_in, 0); \
+    _out1 = __lsx_vexth_hu_bu(_in);      \
+  } while (0)
+
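+/* Load a 32x8 block of coefficients (eight rows of 32) and store it to
+ * tmp_buf transposed, as four 8x8 sub-blocks. */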
+static void idct32x8_row_transpose_store(const int16_t *input,
+                                         int16_t *tmp_buf) {
+  __m128i m0, m1, m2, m3, m4, m5, m6, m7;
+  __m128i n0, n1, n2, n3, n4, n5, n6, n7;
+
+  /* 1st & 2nd 8x8 */
+  DUP4_ARG2(__lsx_vld, input, 0, input, 64, input, 128, input, 192, m0, n0, m1,
+            n1);
+  DUP4_ARG2(__lsx_vld, input, 256, input, 320, input, 384, input, 448, m2, n2,
+            m3, n3);
+  DUP4_ARG2(__lsx_vld, input, 16, input, 80, input, 144, input, 208, m4, n4, m5,
+            n5);
+  DUP4_ARG2(__lsx_vld, input, 272, input, 336, input, 400, input, 464, m6, n6,
+            m7, n7);
+
+  LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+                     n3);
+  LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+                     n7);
+
+  __lsx_vst(m0, tmp_buf, 0);
+  __lsx_vst(n0, tmp_buf, 16);
+  __lsx_vst(m1, tmp_buf, 32);
+  __lsx_vst(n1, tmp_buf, 48);
+  __lsx_vst(m2, tmp_buf, 64);
+  __lsx_vst(n2, tmp_buf, 80);
+  __lsx_vst(m3, tmp_buf, 96);
+  __lsx_vst(n3, tmp_buf, 112);
+  __lsx_vst(m4, tmp_buf, 128);
+  __lsx_vst(n4, tmp_buf, 144);
+  __lsx_vst(m5, tmp_buf, 160);
+  __lsx_vst(n5, tmp_buf, 176);
+  __lsx_vst(m6, tmp_buf, 192);
+  __lsx_vst(n6, tmp_buf, 208);
+  __lsx_vst(m7, tmp_buf, 224);
+  __lsx_vst(n7, tmp_buf, 240);
+
+  /* 3rd & 4th 8x8 */
+  DUP4_ARG2(__lsx_vld, input, 32, input, 96, input, 160, input, 224, m0, n0, m1,
+            n1);
+  DUP4_ARG2(__lsx_vld, input, 288, input, 352, input, 416, input, 480, m2, n2,
+            m3, n3);
+  DUP4_ARG2(__lsx_vld, input, 48, input, 112, input, 176, input, 240, m4, n4,
+            m5, n5);
+  DUP4_ARG2(__lsx_vld, input, 304, input, 368, input, 432, input, 496, m6, n6,
+            m7, n7);
+
+  LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+                     n3);
+  LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+                     n7);
+
+  __lsx_vst(m0, tmp_buf, 256);
+  __lsx_vst(n0, tmp_buf, 272);
+  __lsx_vst(m1, tmp_buf, 288);
+  __lsx_vst(n1, tmp_buf, 304);
+  __lsx_vst(m2, tmp_buf, 320);
+  __lsx_vst(n2, tmp_buf, 336);
+  __lsx_vst(m3, tmp_buf, 352);
+  __lsx_vst(n3, tmp_buf, 368);
+  __lsx_vst(m4, tmp_buf, 384);
+  __lsx_vst(n4, tmp_buf, 400);
+  __lsx_vst(m5, tmp_buf, 416);
+  __lsx_vst(n5, tmp_buf, 432);
+  __lsx_vst(m6, tmp_buf, 448);
+  __lsx_vst(n6, tmp_buf, 464);
+  __lsx_vst(m7, tmp_buf, 480);
+  __lsx_vst(n7, tmp_buf, 496);
+}
+
+static void idct32x8_row_even_process_store(int16_t *tmp_buf,
+                                            int16_t *tmp_eve_buf) {
+  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+  __m128i tmp0;
+
+  /* Even stage 1 */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 64, tmp_buf, 128, tmp_buf, 192,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 256, tmp_buf, 320, tmp_buf, 384, tmp_buf, 448,
+            reg4, reg5, reg6, reg7);
+
+  DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+  DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+  LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
+  DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+  loc1 = vec3;
+  loc0 = vec1;
+
+  DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+  DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+  LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
+  LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
+  LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
+
+  /* Even stage 2 */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 32, tmp_buf, 96, tmp_buf, 160, tmp_buf, 224,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 288, tmp_buf, 352, tmp_buf, 416, tmp_buf, 480,
+            reg4, reg5, reg6, reg7);
+  DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+  DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+  vec0 = __lsx_vadd_h(reg0, reg4);
+  reg0 = __lsx_vsub_h(reg0, reg4);
+  reg4 = __lsx_vadd_h(reg6, reg2);
+  reg6 = __lsx_vsub_h(reg6, reg2);
+  reg2 = __lsx_vadd_h(reg1, reg5);
+  reg1 = __lsx_vsub_h(reg1, reg5);
+  reg5 = __lsx_vadd_h(reg7, reg3);
+  reg7 = __lsx_vsub_h(reg7, reg3);
+  reg3 = vec0;
+
+  vec1 = reg2;
+  reg2 = __lsx_vadd_h(reg3, reg4);
+  reg3 = __lsx_vsub_h(reg3, reg4);
+  reg4 = __lsx_vsub_h(reg5, vec1);
+  reg5 = __lsx_vadd_h(reg5, vec1);
+
+  tmp0 = __lsx_vneg_h(reg6);
+  DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+  DOTP_CONST_PAIR(tmp0, reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+  vec0 = __lsx_vsub_h(reg0, reg6);
+  reg0 = __lsx_vadd_h(reg0, reg6);
+  vec1 = __lsx_vsub_h(reg7, reg1);
+  reg7 = __lsx_vadd_h(reg7, reg1);
+
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+  /* Even stage 3: depends on Even stage 1 & Even stage 2 */
+  LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc0, tmp_eve_buf, 240);
+  __lsx_vst(loc1, tmp_eve_buf, 0);
+  __lsx_vst(loc2, tmp_eve_buf, 224);
+  __lsx_vst(loc3, tmp_eve_buf, 16);
+
+  LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc0, tmp_eve_buf, 208);
+  __lsx_vst(loc1, tmp_eve_buf, 32);
+  __lsx_vst(loc2, tmp_eve_buf, 192);
+  __lsx_vst(loc3, tmp_eve_buf, 48);
+
+  /* Store 8 */
+  LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc0, tmp_eve_buf, 176);
+  __lsx_vst(loc1, tmp_eve_buf, 64);
+  __lsx_vst(loc2, tmp_eve_buf, 160);
+  __lsx_vst(loc3, tmp_eve_buf, 80);
+
+  LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc0, tmp_eve_buf, 144);
+  __lsx_vst(loc1, tmp_eve_buf, 96);
+  __lsx_vst(loc2, tmp_eve_buf, 128);
+  __lsx_vst(loc3, tmp_eve_buf, 112);
+}
+
+static void idct32x8_row_odd_process_store(int16_t *tmp_buf,
+                                           int16_t *tmp_odd_buf) {
+  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+  /* Odd stage 1 */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 16, tmp_buf, 112, tmp_buf, 144, tmp_buf, 240,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 272, tmp_buf, 368, tmp_buf, 400, tmp_buf, 496,
+            reg4, reg5, reg6, reg7);
+
+  DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+  DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+  vec0 = __lsx_vadd_h(reg0, reg3);
+  reg0 = __lsx_vsub_h(reg0, reg3);
+  reg3 = __lsx_vadd_h(reg7, reg4);
+  reg7 = __lsx_vsub_h(reg7, reg4);
+  reg4 = __lsx_vadd_h(reg1, reg2);
+  reg1 = __lsx_vsub_h(reg1, reg2);
+  reg2 = __lsx_vadd_h(reg6, reg5);
+  reg6 = __lsx_vsub_h(reg6, reg5);
+  reg5 = vec0;
+
+  /* 4 Stores */
+  DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1);
+  __lsx_vst(vec0, tmp_odd_buf, 64);
+  __lsx_vst(vec1, tmp_odd_buf, 80);
+
+  DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+  __lsx_vst(vec0, tmp_odd_buf, 0);
+  __lsx_vst(vec1, tmp_odd_buf, 16);
+
+  /* 4 Stores */
+  DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+  LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
+  __lsx_vst(vec0, tmp_odd_buf, 96);
+  __lsx_vst(vec1, tmp_odd_buf, 112);
+
+  DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+  __lsx_vst(vec2, tmp_odd_buf, 32);
+  __lsx_vst(vec3, tmp_odd_buf, 48);
+
+  /* Odd stage 2 */
+  /* 8 loads */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 48, tmp_buf, 80, tmp_buf, 176, tmp_buf, 208,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 304, tmp_buf, 336, tmp_buf, 432, tmp_buf, 464,
+            reg4, reg5, reg6, reg7);
+
+  DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+  DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+  DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+  /* 4 Stores */
+  DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0,
+            vec1, vec2, vec3);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+  DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+
+  LSX_BUTTERFLY_4_H(loc3, loc2, loc0, loc1, vec1, vec0, vec2, vec3);
+  __lsx_vst(vec0, tmp_odd_buf, 192);
+  __lsx_vst(vec1, tmp_odd_buf, 240);
+
+  DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+  __lsx_vst(vec0, tmp_odd_buf, 160);
+  __lsx_vst(vec1, tmp_odd_buf, 176);
+
+  /* 4 Stores */
+  DUP4_ARG2(__lsx_vadd_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec1,
+            vec2, vec0, vec3);
+  LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
+  __lsx_vst(reg0, tmp_odd_buf, 208);
+  __lsx_vst(reg1, tmp_odd_buf, 224);
+
+  DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+  __lsx_vst(reg0, tmp_odd_buf, 128);
+  __lsx_vst(reg1, tmp_odd_buf, 144);
+
+  /* Odd stage 3: depends on Odd stage 1 & Odd stage 2 */
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, tmp_odd_buf, 32,
+            tmp_odd_buf, 48, reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 128, tmp_odd_buf, 144, tmp_odd_buf, 160,
+            tmp_odd_buf, 176, reg4, reg5, reg6, reg7);
+  DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
+            loc1, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 0);
+  __lsx_vst(loc1, tmp_odd_buf, 16);
+  __lsx_vst(loc2, tmp_odd_buf, 32);
+  __lsx_vst(loc3, tmp_odd_buf, 48);
+
+  DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+
+  DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 128);
+  __lsx_vst(loc1, tmp_odd_buf, 144);
+  __lsx_vst(loc2, tmp_odd_buf, 160);
+  __lsx_vst(loc3, tmp_odd_buf, 176);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 80, tmp_odd_buf, 96,
+            tmp_odd_buf, 112, reg1, reg2, reg0, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 192, tmp_odd_buf, 208, tmp_odd_buf, 224,
+            tmp_odd_buf, 240, reg4, reg5, reg6, reg7);
+
+  DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
+            loc1, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 64);
+  __lsx_vst(loc1, tmp_odd_buf, 80);
+  __lsx_vst(loc2, tmp_odd_buf, 96);
+  __lsx_vst(loc3, tmp_odd_buf, 112);
+
+  DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+  DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 192);
+  __lsx_vst(loc1, tmp_odd_buf, 208);
+  __lsx_vst(loc2, tmp_odd_buf, 224);
+  __lsx_vst(loc3, tmp_odd_buf, 240);
+}
+
+static void idct_butterfly_transpose_store(int16_t *tmp_buf,
+                                           int16_t *tmp_eve_buf,
+                                           int16_t *tmp_odd_buf, int16_t *dst) {
+  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  __m128i m0, m1, m2, m3, m4, m5, m6, m7;
+  __m128i n0, n1, n2, n3, n4, n5, n6, n7;
+  __m128i reg0, reg1, reg2, reg3;
+
+  /* Final butterfly: combines the even and odd halves */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 144, tmp_odd_buf, 224,
+            tmp_odd_buf, 96, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 0, tmp_eve_buf, 128, tmp_eve_buf, 64,
+            tmp_eve_buf, 192, loc0, loc1, loc2, loc3);
+
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0,
+            m4, m2, m6);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
+            reg1, reg2, reg3);
+  __lsx_vst(reg0, tmp_buf, 496);
+  __lsx_vst(reg1, tmp_buf, 368);
+  __lsx_vst(reg2, tmp_buf, 432);
+  __lsx_vst(reg3, tmp_buf, 304);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 208, tmp_odd_buf, 160,
+            tmp_odd_buf, 48, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 32, tmp_eve_buf, 160, tmp_eve_buf, 96,
+            tmp_eve_buf, 224, loc0, loc1, loc2, loc3);
+
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1,
+            m5, m3, m7);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
+            reg1, reg2, reg3);
+  __lsx_vst(reg0, tmp_buf, 464);
+  __lsx_vst(reg1, tmp_buf, 336);
+  __lsx_vst(reg2, tmp_buf, 400);
+  __lsx_vst(reg3, tmp_buf, 272);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 32, tmp_odd_buf, 176, tmp_odd_buf, 192,
+            tmp_odd_buf, 112, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 16, tmp_eve_buf, 144, tmp_eve_buf, 80,
+            tmp_eve_buf, 208, loc0, loc1, loc2, loc3);
+
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0,
+            n4, n2, n6);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
+            reg1, reg2, reg3);
+  __lsx_vst(reg0, tmp_buf, 480);
+  __lsx_vst(reg1, tmp_buf, 352);
+  __lsx_vst(reg2, tmp_buf, 416);
+  __lsx_vst(reg3, tmp_buf, 288);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 80, tmp_odd_buf, 240, tmp_odd_buf, 128,
+            tmp_odd_buf, 16, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 48, tmp_eve_buf, 176, tmp_eve_buf, 112,
+            tmp_eve_buf, 240, loc0, loc1, loc2, loc3);
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1,
+            n5, n3, n7);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, reg0,
+            reg1, reg2, reg3);
+  __lsx_vst(reg0, tmp_buf, 448);
+  __lsx_vst(reg1, tmp_buf, 320);
+  __lsx_vst(reg2, tmp_buf, 384);
+  __lsx_vst(reg3, tmp_buf, 256);
+
+  /* Transpose : 16 vectors */
+  /* 1st & 2nd 8x8 */
+  LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+                     n3);
+  __lsx_vst(m0, dst, 0);
+  __lsx_vst(n0, dst, 64);
+  __lsx_vst(m1, dst, 128);
+  __lsx_vst(n1, dst, 192);
+  __lsx_vst(m2, dst, 256);
+  __lsx_vst(n2, dst, 320);
+  __lsx_vst(m3, dst, 384);
+  __lsx_vst(n3, dst, 448);
+
+  LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+                     n7);
+  __lsx_vst(m4, dst, 16);
+  __lsx_vst(n4, dst, 80);
+  __lsx_vst(m5, dst, 144);
+  __lsx_vst(n5, dst, 208);
+  __lsx_vst(m6, dst, 272);
+  __lsx_vst(n6, dst, 336);
+  __lsx_vst(m7, dst, 400);
+  __lsx_vst(n7, dst, 464);
+
+  /* 3rd & 4th 8x8 */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 256, tmp_buf, 272, tmp_buf, 288, tmp_buf, 304,
+            m0, n0, m1, n1);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 320, tmp_buf, 336, tmp_buf, 352, tmp_buf, 368,
+            m2, n2, m3, n3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 384, tmp_buf, 400, tmp_buf, 416, tmp_buf, 432,
+            m4, n4, m5, n5);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 448, tmp_buf, 464, tmp_buf, 480, tmp_buf, 496,
+            m6, n6, m7, n7);
+  LSX_TRANSPOSE8x8_H(m0, n0, m1, n1, m2, n2, m3, n3, m0, n0, m1, n1, m2, n2, m3,
+                     n3);
+  LSX_TRANSPOSE8x8_H(m4, n4, m5, n5, m6, n6, m7, n7, m4, n4, m5, n5, m6, n6, m7,
+                     n7);
+  __lsx_vst(m0, dst, 32);
+  __lsx_vst(n0, dst, 96);
+  __lsx_vst(m1, dst, 160);
+  __lsx_vst(n1, dst, 224);
+  __lsx_vst(m2, dst, 288);
+  __lsx_vst(n2, dst, 352);
+  __lsx_vst(m3, dst, 416);
+  __lsx_vst(n3, dst, 480);
+  __lsx_vst(m4, dst, 48);
+  __lsx_vst(n4, dst, 112);
+  __lsx_vst(m5, dst, 176);
+  __lsx_vst(n5, dst, 240);
+  __lsx_vst(m6, dst, 304);
+  __lsx_vst(n6, dst, 368);
+  __lsx_vst(m7, dst, 432);
+  __lsx_vst(n7, dst, 496);
+}
+
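+/* 1-D inverse transform of a 32x8 slice of rows: transpose the input,
+ * run the even and odd halves of the 32-point IDCT into scratch buffers,
+ * then apply the final butterfly and transpose the result into output. */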
+static void idct32x8_1d_rows_lsx(const int16_t *input, int16_t *output) {
+  DECLARE_ALIGNED(32, int16_t, tmp_buf[8 * 32]);
+  DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
+  DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
+
+  idct32x8_row_transpose_store(input, &tmp_buf[0]);
+  idct32x8_row_even_process_store(&tmp_buf[0], &tmp_eve_buf[0]);
+  idct32x8_row_odd_process_store(&tmp_buf[0], &tmp_odd_buf[0]);
+  idct_butterfly_transpose_store(&tmp_buf[0], &tmp_eve_buf[0], &tmp_odd_buf[0],
+                                 output);
+}
+
+static void idct8x32_column_even_process_store(int16_t *tmp_buf,
+                                               int16_t *tmp_eve_buf) {
+  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  __m128i stp0, stp1, stp2, stp3, stp4, stp5, stp6, stp7;
+  __m128i tmp0;
+
+  /* Even stage 1 */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 256, tmp_buf, 512, tmp_buf, 768,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 1024, tmp_buf, 1280, tmp_buf, 1536, tmp_buf,
+            1792, reg4, reg5, reg6, reg7);
+  tmp_buf += 64;
+
+  DOTP_CONST_PAIR(reg1, reg7, cospi_28_64, cospi_4_64, reg1, reg7);
+  DOTP_CONST_PAIR(reg5, reg3, cospi_12_64, cospi_20_64, reg5, reg3);
+  LSX_BUTTERFLY_4_H(reg1, reg7, reg3, reg5, vec1, vec3, vec2, vec0);
+  DOTP_CONST_PAIR(vec2, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+
+  loc1 = vec3;
+  loc0 = vec1;
+
+  DOTP_CONST_PAIR(reg0, reg4, cospi_16_64, cospi_16_64, reg0, reg4);
+  DOTP_CONST_PAIR(reg2, reg6, cospi_24_64, cospi_8_64, reg2, reg6);
+  LSX_BUTTERFLY_4_H(reg4, reg0, reg2, reg6, vec1, vec3, vec2, vec0);
+  LSX_BUTTERFLY_4_H(vec0, vec1, loc1, loc0, stp3, stp0, stp7, stp4);
+  LSX_BUTTERFLY_4_H(vec2, vec3, loc3, loc2, stp2, stp1, stp6, stp5);
+
+  /* Even stage 2 */
+  /* Load 8 */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 0, tmp_buf, 256, tmp_buf, 512, tmp_buf, 768,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 1024, tmp_buf, 1280, tmp_buf, 1536, tmp_buf,
+            1792, reg4, reg5, reg6, reg7);
+  DOTP_CONST_PAIR(reg0, reg7, cospi_30_64, cospi_2_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_14_64, cospi_18_64, reg4, reg3);
+  DOTP_CONST_PAIR(reg2, reg5, cospi_22_64, cospi_10_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg6, reg1, cospi_6_64, cospi_26_64, reg6, reg1);
+
+  vec0 = __lsx_vadd_h(reg0, reg4);
+  reg0 = __lsx_vsub_h(reg0, reg4);
+  reg4 = __lsx_vadd_h(reg6, reg2);
+  reg6 = __lsx_vsub_h(reg6, reg2);
+  reg2 = __lsx_vadd_h(reg1, reg5);
+  reg1 = __lsx_vsub_h(reg1, reg5);
+  reg5 = __lsx_vadd_h(reg7, reg3);
+  reg7 = __lsx_vsub_h(reg7, reg3);
+  reg3 = vec0;
+
+  vec1 = reg2;
+  reg2 = __lsx_vadd_h(reg3, reg4);
+  reg3 = __lsx_vsub_h(reg3, reg4);
+  reg4 = __lsx_vsub_h(reg5, vec1);
+  reg5 = __lsx_vadd_h(reg5, vec1);
+
+  tmp0 = __lsx_vneg_h(reg6);
+  DOTP_CONST_PAIR(reg7, reg0, cospi_24_64, cospi_8_64, reg0, reg7);
+  DOTP_CONST_PAIR(tmp0, reg1, cospi_24_64, cospi_8_64, reg6, reg1);
+
+  vec0 = __lsx_vsub_h(reg0, reg6);
+  reg0 = __lsx_vadd_h(reg0, reg6);
+  vec1 = __lsx_vsub_h(reg7, reg1);
+  reg7 = __lsx_vadd_h(reg7, reg1);
+
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, reg6, reg1);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_16_64, cospi_16_64, reg3, reg4);
+
+  /* Even stage 3: depends on Even stage 1 & Even stage 2 */
+  /* Store 8 */
+  LSX_BUTTERFLY_4_H(stp0, stp1, reg7, reg5, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc1, tmp_eve_buf, 0);
+  __lsx_vst(loc3, tmp_eve_buf, 16);
+  __lsx_vst(loc2, tmp_eve_buf, 224);
+  __lsx_vst(loc0, tmp_eve_buf, 240);
+
+  LSX_BUTTERFLY_4_H(stp2, stp3, reg4, reg1, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc1, tmp_eve_buf, 32);
+  __lsx_vst(loc3, tmp_eve_buf, 48);
+  __lsx_vst(loc2, tmp_eve_buf, 192);
+  __lsx_vst(loc0, tmp_eve_buf, 208);
+
+  /* Store 8 */
+  LSX_BUTTERFLY_4_H(stp4, stp5, reg6, reg3, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc1, tmp_eve_buf, 64);
+  __lsx_vst(loc3, tmp_eve_buf, 80);
+  __lsx_vst(loc2, tmp_eve_buf, 160);
+  __lsx_vst(loc0, tmp_eve_buf, 176);
+
+  LSX_BUTTERFLY_4_H(stp6, stp7, reg2, reg0, loc1, loc3, loc2, loc0);
+  __lsx_vst(loc1, tmp_eve_buf, 96);
+  __lsx_vst(loc3, tmp_eve_buf, 112);
+  __lsx_vst(loc2, tmp_eve_buf, 128);
+  __lsx_vst(loc0, tmp_eve_buf, 144);
+}
+
+static void idct8x32_column_odd_process_store(int16_t *tmp_buf,
+                                              int16_t *tmp_odd_buf) {
+  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+
+  /* Odd stage 1 */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 64, tmp_buf, 448, tmp_buf, 576, tmp_buf, 960,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 1088, tmp_buf, 1472, tmp_buf, 1600, tmp_buf,
+            1984, reg4, reg5, reg6, reg7);
+
+  DOTP_CONST_PAIR(reg0, reg7, cospi_31_64, cospi_1_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg4, reg3, cospi_15_64, cospi_17_64, reg3, reg4);
+  DOTP_CONST_PAIR(reg2, reg5, cospi_23_64, cospi_9_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg6, reg1, cospi_7_64, cospi_25_64, reg1, reg6);
+
+  vec0 = __lsx_vadd_h(reg0, reg3);
+  reg0 = __lsx_vsub_h(reg0, reg3);
+  reg3 = __lsx_vadd_h(reg7, reg4);
+  reg7 = __lsx_vsub_h(reg7, reg4);
+  reg4 = __lsx_vadd_h(reg1, reg2);
+  reg1 = __lsx_vsub_h(reg1, reg2);
+  reg2 = __lsx_vadd_h(reg6, reg5);
+  reg6 = __lsx_vsub_h(reg6, reg5);
+  reg5 = vec0;
+
+  /* 4 Stores */
+  DUP2_ARG2(__lsx_vadd_h, reg5, reg4, reg3, reg2, vec0, vec1);
+  __lsx_vst(vec0, tmp_odd_buf, 64);
+  __lsx_vst(vec1, tmp_odd_buf, 80);
+  DUP2_ARG2(__lsx_vsub_h, reg5, reg4, reg3, reg2, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_24_64, cospi_8_64, vec0, vec1);
+  __lsx_vst(vec0, tmp_odd_buf, 0);
+  __lsx_vst(vec1, tmp_odd_buf, 16);
+
+  /* 4 Stores */
+  DOTP_CONST_PAIR(reg7, reg0, cospi_28_64, cospi_4_64, reg0, reg7);
+  DOTP_CONST_PAIR(reg6, reg1, -cospi_4_64, cospi_28_64, reg1, reg6);
+  LSX_BUTTERFLY_4_H(reg0, reg7, reg6, reg1, vec0, vec1, vec2, vec3);
+  DOTP_CONST_PAIR(vec2, vec3, cospi_24_64, cospi_8_64, vec2, vec3);
+  __lsx_vst(vec0, tmp_odd_buf, 96);
+  __lsx_vst(vec1, tmp_odd_buf, 112);
+  __lsx_vst(vec2, tmp_odd_buf, 32);
+  __lsx_vst(vec3, tmp_odd_buf, 48);
+
+  /* Odd stage 2 */
+  /* 8 loads */
+  DUP4_ARG2(__lsx_vld, tmp_buf, 192, tmp_buf, 320, tmp_buf, 704, tmp_buf, 832,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_buf, 1216, tmp_buf, 1344, tmp_buf, 1728, tmp_buf,
+            1856, reg4, reg5, reg6, reg7);
+  DOTP_CONST_PAIR(reg1, reg6, cospi_27_64, cospi_5_64, reg1, reg6);
+  DOTP_CONST_PAIR(reg5, reg2, cospi_11_64, cospi_21_64, reg2, reg5);
+  DOTP_CONST_PAIR(reg3, reg4, cospi_19_64, cospi_13_64, reg3, reg4);
+  DOTP_CONST_PAIR(reg7, reg0, cospi_3_64, cospi_29_64, reg0, reg7);
+
+  /* 4 Stores */
+  DUP4_ARG2(__lsx_vsub_h, reg1, reg2, reg6, reg5, reg0, reg3, reg7, reg4, vec0,
+            vec1, vec2, vec3);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_12_64, cospi_20_64, loc0, loc1);
+  DOTP_CONST_PAIR(vec3, vec2, -cospi_20_64, cospi_12_64, loc2, loc3);
+  LSX_BUTTERFLY_4_H(loc2, loc3, loc1, loc0, vec0, vec1, vec3, vec2);
+  __lsx_vst(vec0, tmp_odd_buf, 192);
+  __lsx_vst(vec1, tmp_odd_buf, 240);
+  DOTP_CONST_PAIR(vec3, vec2, -cospi_8_64, cospi_24_64, vec0, vec1);
+  __lsx_vst(vec0, tmp_odd_buf, 160);
+  __lsx_vst(vec1, tmp_odd_buf, 176);
+
+  /* 4 Stores */
+  DUP4_ARG2(__lsx_vadd_h, reg0, reg3, reg1, reg2, reg5, reg6, reg4, reg7, vec0,
+            vec1, vec2, vec3);
+  LSX_BUTTERFLY_4_H(vec0, vec3, vec2, vec1, reg0, reg1, reg3, reg2);
+  __lsx_vst(reg0, tmp_odd_buf, 208);
+  __lsx_vst(reg1, tmp_odd_buf, 224);
+  DOTP_CONST_PAIR(reg3, reg2, -cospi_8_64, cospi_24_64, reg0, reg1);
+  __lsx_vst(reg0, tmp_odd_buf, 128);
+  __lsx_vst(reg1, tmp_odd_buf, 144);
+
+  /* Odd stage 3: depends on Odd stage 1 & Odd stage 2 */
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 16, tmp_odd_buf, 32,
+            tmp_odd_buf, 48, reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 128, tmp_odd_buf, 144, tmp_odd_buf, 160,
+            tmp_odd_buf, 176, reg4, reg5, reg6, reg7);
+  DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
+            loc1, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 0);
+  __lsx_vst(loc1, tmp_odd_buf, 16);
+  __lsx_vst(loc2, tmp_odd_buf, 32);
+  __lsx_vst(loc3, tmp_odd_buf, 48);
+
+  DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg1, reg5, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+  DUP2_ARG2(__lsx_vsub_h, reg2, reg6, reg3, reg7, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 128);
+  __lsx_vst(loc1, tmp_odd_buf, 144);
+  __lsx_vst(loc2, tmp_odd_buf, 160);
+  __lsx_vst(loc3, tmp_odd_buf, 176);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 80, tmp_odd_buf, 96,
+            tmp_odd_buf, 112, reg1, reg2, reg0, reg3);
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 192, tmp_odd_buf, 208, tmp_odd_buf, 224,
+            tmp_odd_buf, 240, reg4, reg5, reg6, reg7);
+  DUP4_ARG2(__lsx_vadd_h, reg0, reg4, reg1, reg5, reg2, reg6, reg3, reg7, loc0,
+            loc1, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 64);
+  __lsx_vst(loc1, tmp_odd_buf, 80);
+  __lsx_vst(loc2, tmp_odd_buf, 96);
+  __lsx_vst(loc3, tmp_odd_buf, 112);
+
+  DUP2_ARG2(__lsx_vsub_h, reg0, reg4, reg3, reg7, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc0, loc1);
+  DUP2_ARG2(__lsx_vsub_h, reg1, reg5, reg2, reg6, vec0, vec1);
+  DOTP_CONST_PAIR(vec1, vec0, cospi_16_64, cospi_16_64, loc2, loc3);
+  __lsx_vst(loc0, tmp_odd_buf, 192);
+  __lsx_vst(loc1, tmp_odd_buf, 208);
+  __lsx_vst(loc2, tmp_odd_buf, 224);
+  __lsx_vst(loc3, tmp_odd_buf, 240);
+}
+
+static void idct8x32_column_butterfly_addblk(int16_t *tmp_eve_buf,
+                                             int16_t *tmp_odd_buf, uint8_t *dst,
+                                             int32_t dst_stride) {
+  __m128i vec0, vec1, vec2, vec3, loc0, loc1, loc2, loc3;
+  __m128i m0, m1, m2, m3, m4, m5, m6, m7;
+  __m128i n0, n1, n2, n3, n4, n5, n6, n7;
+  int32_t stride = dst_stride << 2;
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride + stride2;
+
+  /* Final butterfly: combines the even and odd halves */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 0, tmp_odd_buf, 144, tmp_odd_buf, 224,
+            tmp_odd_buf, 96, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 0, tmp_eve_buf, 128, tmp_eve_buf, 64,
+            tmp_eve_buf, 192, loc0, loc1, loc2, loc3);
+
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m0,
+            m4, m2, m6);
+  DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6);
+  VP9_ADDBLK_ST8x4_UB(dst, stride, stride2, stride3, m0, m2, m4, m6);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m6,
+            m2, m4, m0);
+  DUP4_ARG2(__lsx_vsrari_h, m0, 6, m2, 6, m4, 6, m6, 6, m0, m2, m4, m6);
+  VP9_ADDBLK_ST8x4_UB((dst + 19 * dst_stride), stride, stride2, stride3, m0, m2,
+                      m4, m6);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 64, tmp_odd_buf, 208, tmp_odd_buf, 160,
+            tmp_odd_buf, 48, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 32, tmp_eve_buf, 160, tmp_eve_buf, 96,
+            tmp_eve_buf, 224, loc0, loc1, loc2, loc3);
+
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m1,
+            m5, m3, m7);
+  DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7);
+  VP9_ADDBLK_ST8x4_UB((dst + 2 * dst_stride), stride, stride2, stride3, m1, m3,
+                      m5, m7);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, m7,
+            m3, m5, m1);
+  DUP4_ARG2(__lsx_vsrari_h, m1, 6, m3, 6, m5, 6, m7, 6, m1, m3, m5, m7);
+  VP9_ADDBLK_ST8x4_UB((dst + 17 * dst_stride), stride, stride2, stride3, m1, m3,
+                      m5, m7);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 32, tmp_odd_buf, 176, tmp_odd_buf, 192,
+            tmp_odd_buf, 112, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 16, tmp_eve_buf, 144, tmp_eve_buf, 80,
+            tmp_eve_buf, 208, loc0, loc1, loc2, loc3);
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n0,
+            n4, n2, n6);
+  DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6);
+  VP9_ADDBLK_ST8x4_UB((dst + dst_stride), stride, stride2, stride3, n0, n2, n4,
+                      n6);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n6,
+            n2, n4, n0);
+  DUP4_ARG2(__lsx_vsrari_h, n0, 6, n2, 6, n4, 6, n6, 6, n0, n2, n4, n6);
+  VP9_ADDBLK_ST8x4_UB((dst + 18 * dst_stride), stride, stride2, stride3, n0, n2,
+                      n4, n6);
+
+  /* Load 8 & Store 8 */
+  DUP4_ARG2(__lsx_vld, tmp_odd_buf, 80, tmp_odd_buf, 240, tmp_odd_buf, 128,
+            tmp_odd_buf, 16, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vld, tmp_eve_buf, 48, tmp_eve_buf, 176, tmp_eve_buf, 112,
+            tmp_eve_buf, 240, loc0, loc1, loc2, loc3);
+  DUP4_ARG2(__lsx_vadd_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n1,
+            n5, n3, n7);
+  DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7);
+  VP9_ADDBLK_ST8x4_UB((dst + 3 * dst_stride), stride, stride2, stride3, n1, n3,
+                      n5, n7);
+  DUP4_ARG2(__lsx_vsub_h, loc0, vec3, loc1, vec2, loc2, vec1, loc3, vec0, n7,
+            n3, n5, n1);
+  DUP4_ARG2(__lsx_vsrari_h, n1, 6, n3, 6, n5, 6, n7, 6, n1, n3, n5, n7);
+  VP9_ADDBLK_ST8x4_UB((dst + 16 * dst_stride), stride, stride2, stride3, n1, n3,
+                      n5, n7);
+}
+
+static void idct8x32_1d_columns_addblk_lsx(int16_t *input, uint8_t *dst,
+                                           int32_t dst_stride) {
+  DECLARE_ALIGNED(32, int16_t, tmp_odd_buf[16 * 8]);
+  DECLARE_ALIGNED(32, int16_t, tmp_eve_buf[16 * 8]);
+
+  idct8x32_column_even_process_store(input, &tmp_eve_buf[0]);
+  idct8x32_column_odd_process_store(input, &tmp_odd_buf[0]);
+  idct8x32_column_butterfly_addblk(&tmp_eve_buf[0], &tmp_odd_buf[0], dst,
+                                   dst_stride);
+}
+
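+/* Full 32x32 inverse transform: four 32x8 row passes into out_arr,
+ * followed by four 8x32 column passes that add the result to dst. */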
+void vpx_idct32x32_1024_add_lsx(const int16_t *input, uint8_t *dst,
+                                int32_t dst_stride) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
+  int16_t *out_ptr = out_arr;
+
+  /* transform rows */
+  for (i = 0; i < 4; ++i) {
+    /* process 32 * 8 block */
+    idct32x8_1d_rows_lsx((input + (i << 8)), (out_ptr + (i << 8)));
+  }
+
+  for (i = 0; i < 4; ++i) {
+    /* process 8 * 32 block */
+    idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), (dst + (i << 3)),
+                                   dst_stride);
+  }
+}
+
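+/* 32x32 inverse transform when at most 34 coefficients are non-zero:
+ * out_arr is zeroed and a single row pass suffices, since every non-zero
+ * coefficient lies in the upper-left 8x8 block. */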
+void vpx_idct32x32_34_add_lsx(const int16_t *input, uint8_t *dst,
+                              int32_t dst_stride) {
+  int32_t i;
+  DECLARE_ALIGNED(32, int16_t, out_arr[32 * 32]);
+  int16_t *out_ptr = out_arr;
+  __m128i zero = __lsx_vldi(0);
+
+  for (i = 32; i--;) {
+    __lsx_vst(zero, out_ptr, 0);
+    __lsx_vst(zero, out_ptr, 16);
+    __lsx_vst(zero, out_ptr, 32);
+    __lsx_vst(zero, out_ptr, 48);
+    out_ptr += 32;
+  }
+
+  out_ptr = out_arr;
+
+  /* rows: only upper-left 8x8 has non-zero coeff */
+  idct32x8_1d_rows_lsx(input, out_ptr);
+
+  /* transform columns */
+  for (i = 0; i < 4; ++i) {
+    /* process 8 * 32 block */
+    idct8x32_1d_columns_addblk_lsx((out_ptr + (i << 3)), (dst + (i << 3)),
+                                   dst_stride);
+  }
+}
+
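+/* DC-only path: input[0] is scaled by cospi_16_64 once per 1-D pass,
+ * rounded by 6 bits, broadcast, and added to every destination pixel
+ * with unsigned saturation, two rows of 32 pixels per iteration. */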
+void vpx_idct32x32_1_add_lsx(const int16_t *input, uint8_t *dst,
+                             int32_t dst_stride) {
+  int32_t i;
+  int16_t out;
+  __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+  __m128i res0, res1, res2, res3, res4, res5, res6, res7, vec;
+
+  out = ROUND_POWER_OF_TWO((input[0] * cospi_16_64), DCT_CONST_BITS);
+  out = ROUND_POWER_OF_TWO((out * cospi_16_64), DCT_CONST_BITS);
+  out = ROUND_POWER_OF_TWO(out, 6);
+
+  vec = __lsx_vreplgr2vr_h(out);
+
+  for (i = 16; i--;) {
+    DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
+    dst2 = __lsx_vldx(dst, dst_stride);
+    dst3 = __lsx_vldx(dst + 16, dst_stride);
+
+    UNPCK_UB_SH(dst0, res0, res4);
+    UNPCK_UB_SH(dst1, res1, res5);
+    UNPCK_UB_SH(dst2, res2, res6);
+    UNPCK_UB_SH(dst3, res3, res7);
+
+    DUP4_ARG2(__lsx_vadd_h, res0, vec, res1, vec, res2, vec, res3, vec, res0,
+              res1, res2, res3);
+    DUP4_ARG2(__lsx_vadd_h, res4, vec, res5, vec, res6, vec, res7, vec, res4,
+              res5, res6, res7);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, res4, res0, 0, res5, res1, 0, res6, res2, 0,
+              res7, res3, 0, tmp0, tmp1, tmp2, tmp3);
+    __lsx_vst(tmp0, dst, 0);
+    __lsx_vst(tmp1, dst, 16);
+    dst += dst_stride;
+    __lsx_vst(tmp2, dst, 0);
+    __lsx_vst(tmp3, dst, 16);
+    dst += dst_stride;
+  }
+}
diff --git a/libvpx/vpx_dsp/loongarch/intrapred_lsx.c b/libvpx/vpx_dsp/loongarch/intrapred_lsx.c
new file mode 100644
index 0000000..f990211
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/intrapred_lsx.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
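+/* DC prediction: both helpers broadcast the rounded average of the
+ * neighboring pixels, dc = (sum + 8) >> 4 over 16 samples for 8x8 and
+ * dc = (sum + 16) >> 5 over 32 samples for 16x16. */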
+static inline void intra_predict_dc_8x8_lsx(const uint8_t *src_top,
+                                            const uint8_t *src_left,
+                                            uint8_t *dst, int32_t dst_stride) {
+  uint64_t val0, val1;
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i store, sum_h, sum_w, sum_d;
+  __m128i src = { 0 };
+
+  val0 = *(const uint64_t *)src_top;
+  val1 = *(const uint64_t *)src_left;
+  DUP2_ARG3(__lsx_vinsgr2vr_d, src, val0, 0, src, val1, 1, src, src);
+  sum_h = __lsx_vhaddw_hu_bu(src, src);
+  sum_w = __lsx_vhaddw_wu_hu(sum_h, sum_h);
+  sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w);
+  sum_w = __lsx_vpickev_w(sum_d, sum_d);
+  sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w);
+  sum_w = __lsx_vsrari_w(sum_d, 4);
+  store = __lsx_vreplvei_b(sum_w, 0);
+
+  __lsx_vstelm_d(store, dst, 0, 0);
+  __lsx_vstelm_d(store, dst + dst_stride, 0, 0);
+  __lsx_vstelm_d(store, dst + dst_stride_x2, 0, 0);
+  __lsx_vstelm_d(store, dst + dst_stride_x3, 0, 0);
+  dst += dst_stride_x4;
+  __lsx_vstelm_d(store, dst, 0, 0);
+  __lsx_vstelm_d(store, dst + dst_stride, 0, 0);
+  __lsx_vstelm_d(store, dst + dst_stride_x2, 0, 0);
+  __lsx_vstelm_d(store, dst + dst_stride_x3, 0, 0);
+}
+
+static inline void intra_predict_dc_16x16_lsx(const uint8_t *src_top,
+                                              const uint8_t *src_left,
+                                              uint8_t *dst,
+                                              int32_t dst_stride) {
+  int32_t dst_stride_x2 = dst_stride << 1;
+  int32_t dst_stride_x3 = dst_stride_x2 + dst_stride;
+  int32_t dst_stride_x4 = dst_stride << 2;
+  __m128i top, left, out;
+  __m128i sum_h, sum_top, sum_left;
+  __m128i sum_w;
+  __m128i sum_d;
+
+  DUP2_ARG2(__lsx_vld, src_top, 0, src_left, 0, top, left);
+  DUP2_ARG2(__lsx_vhaddw_hu_bu, top, top, left, left, sum_top, sum_left);
+  sum_h = __lsx_vadd_h(sum_top, sum_left);
+  sum_w = __lsx_vhaddw_wu_hu(sum_h, sum_h);
+  sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w);
+  sum_w = __lsx_vpickev_w(sum_d, sum_d);
+  sum_d = __lsx_vhaddw_du_wu(sum_w, sum_w);
+  sum_w = __lsx_vsrari_w(sum_d, 5);
+  out = __lsx_vreplvei_b(sum_w, 0);
+
+  __lsx_vstx(out, dst, 0);
+  __lsx_vstx(out, dst, dst_stride);
+  __lsx_vstx(out, dst, dst_stride_x2);
+  __lsx_vstx(out, dst, dst_stride_x3);
+  dst += dst_stride_x4;
+  __lsx_vstx(out, dst, 0);
+  __lsx_vstx(out, dst, dst_stride);
+  __lsx_vstx(out, dst, dst_stride_x2);
+  __lsx_vstx(out, dst, dst_stride_x3);
+  dst += dst_stride_x4;
+  __lsx_vstx(out, dst, 0);
+  __lsx_vstx(out, dst, dst_stride);
+  __lsx_vstx(out, dst, dst_stride_x2);
+  __lsx_vstx(out, dst, dst_stride_x3);
+  dst += dst_stride_x4;
+  __lsx_vstx(out, dst, 0);
+  __lsx_vstx(out, dst, dst_stride);
+  __lsx_vstx(out, dst, dst_stride_x2);
+  __lsx_vstx(out, dst, dst_stride_x3);
+}
+
+void vpx_dc_predictor_8x8_lsx(uint8_t *dst, ptrdiff_t y_stride,
+                              const uint8_t *above, const uint8_t *left) {
+  intra_predict_dc_8x8_lsx(above, left, dst, y_stride);
+}
+
+void vpx_dc_predictor_16x16_lsx(uint8_t *dst, ptrdiff_t y_stride,
+                                const uint8_t *above, const uint8_t *left) {
+  intra_predict_dc_16x16_lsx(above, left, dst, y_stride);
+}
diff --git a/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c b/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c
new file mode 100644
index 0000000..0503df9
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/loopfilter_16_lsx.c
@@ -0,0 +1,1320 @@
+/*
+ * Copyright (c) 2022 Loongson Technology Corporation Limited
+ * Contributed by Hecai Yuan <yuanhecai@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/loopfilter_lsx.h"
+#include "vpx_ports/mem.h"
+
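+/* Load/store eight consecutive rows of one vector each; the base pointer
+ * is advanced by four strides halfway through. */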
+#define LSX_LD_8(_src, _stride, _stride2, _stride3, _stride4, _in0, _in1, \
+                 _in2, _in3, _in4, _in5, _in6, _in7)                      \
+  do {                                                                    \
+    _in0 = __lsx_vld(_src, 0);                                            \
+    _in1 = __lsx_vldx(_src, _stride);                                     \
+    _in2 = __lsx_vldx(_src, _stride2);                                    \
+    _in3 = __lsx_vldx(_src, _stride3);                                    \
+    _src += _stride4;                                                     \
+    _in4 = __lsx_vld(_src, 0);                                            \
+    _in5 = __lsx_vldx(_src, _stride);                                     \
+    _in6 = __lsx_vldx(_src, _stride2);                                    \
+    _in7 = __lsx_vldx(_src, _stride3);                                    \
+  } while (0)
+
+#define LSX_ST_8(_dst0, _dst1, _dst2, _dst3, _dst4, _dst5, _dst6, _dst7, _dst, \
+                 _stride, _stride2, _stride3, _stride4)                        \
+  do {                                                                         \
+    __lsx_vst(_dst0, _dst, 0);                                                 \
+    __lsx_vstx(_dst1, _dst, _stride);                                          \
+    __lsx_vstx(_dst2, _dst, _stride2);                                         \
+    __lsx_vstx(_dst3, _dst, _stride3);                                         \
+    _dst += _stride4;                                                          \
+    __lsx_vst(_dst4, _dst, 0);                                                 \
+    __lsx_vstx(_dst5, _dst, _stride);                                          \
+    __lsx_vstx(_dst6, _dst, _stride2);                                         \
+    __lsx_vstx(_dst7, _dst, _stride3);                                         \
+  } while (0)
+
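+/* First stage of the 16-wide horizontal loop filter: applies the 4-tap
+ * filter and, where the flat mask is set, the 8-tap filter. Returns 1
+ * when flat is all zero (the 4-tap results are stored directly and no
+ * further work is needed); otherwise the filter8 results and the mask
+ * are written to the filter48 scratch area and 0 is returned. */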
+static int32_t hz_lpf_t4_and_t8_16w(uint8_t *dst, int32_t stride,
+                                    uint8_t *filter48,
+                                    const uint8_t *b_limit_ptr,
+                                    const uint8_t *limit_ptr,
+                                    const uint8_t *thresh_ptr) {
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+  __m128i flat, mask, hev, thresh, b_limit, limit;
+  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
+  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
+  __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
+
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  int32_t stride4 = stride2 << 1;
+
+  /* load vector elements */
+  DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
+            -stride, p3, p2, p1, p0);
+
+  q0 = __lsx_vld(dst, 0);
+  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+  q3 = __lsx_vldx(dst, stride3);
+
+  thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+  b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+  limit = __lsx_vldrepl_b(limit_ptr, 0);
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  if (__lsx_bz_v(flat)) {
+    __lsx_vstx(p1_out, dst, -stride2);
+    __lsx_vstx(p0_out, dst, -stride);
+    __lsx_vst(q0_out, dst, 0);
+    __lsx_vstx(q1_out, dst, stride);
+
+    return 1;
+  }
+
+  DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
+            p0_l);
+  DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+            q3_l);
+
+  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+              p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+  DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h);
+  DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);
+  VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
+              p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
+
+  /* convert 16 bit output data into 8 bit */
+  DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l,
+            p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
+            p1_filt8_l, p0_filt8_l, q0_filt8_l);
+  DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
+            q1_filt8_l, q2_filt8_l);
+
+  /* store pixel values */
+  DUP4_ARG3(__lsx_vbitsel_v, p2, p2_filt8_l, flat, p1_out, p1_filt8_l, flat,
+            p0_out, p0_filt8_l, flat, q0_out, q0_filt8_l, flat, p2_out, p1_out,
+            p0_out, q0_out);
+  DUP2_ARG3(__lsx_vbitsel_v, q1_out, q1_filt8_l, flat, q2, q2_filt8_l, flat,
+            q1_out, q2_out);
+
+  __lsx_vst(p2_out, filter48, 0);
+  __lsx_vst(p1_out, filter48, 16);
+  __lsx_vst(p0_out, filter48, 32);
+  __lsx_vst(q0_out, filter48, 48);
+  __lsx_vst(q1_out, filter48, 64);
+  __lsx_vst(q2_out, filter48, 80);
+  __lsx_vst(flat, filter48, 96);
+
+  return 0;
+}
+
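+/* filter16 stage for a 16-pixel-wide horizontal edge. If the flat2 mask is
+ * all zero, the filter8 results saved in filter48[] are written out;
+ * otherwise each output pixel is a moving average with weights summing to
+ * 16, maintained as a running sum and rounded right-shifted by 4. */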
+static void hz_lpf_t16_16w(uint8_t *dst, int32_t stride, uint8_t *filter48) {
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  int32_t stride4 = stride2 << 1;
+  uint8_t *dst_tmp0 = dst - stride4;
+  uint8_t *dst_tmp1 = dst + stride4;
+
+  __m128i flat, flat2, filter8;
+  __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  __m128i out_h, out_l;
+  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
+  v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
+  v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+  v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in;
+  v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in;
+  v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in;
+  v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in;
+  v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h;
+
+  flat = __lsx_vld(filter48, 96);
+
+  DUP4_ARG2(__lsx_vldx, dst_tmp0, -stride4, dst_tmp0, -stride3, dst_tmp0,
+            -stride2, dst_tmp0, -stride, p7, p6, p5, p4);
+
+  p3 = __lsx_vld(dst_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp0, stride, dst_tmp0, stride2, p2, p1);
+  p0 = __lsx_vldx(dst_tmp0, stride3);
+
+  q0 = __lsx_vld(dst, 0);
+  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+  q3 = __lsx_vldx(dst, stride3);
+
+  q4 = __lsx_vld(dst_tmp1, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6);
+  q7 = __lsx_vldx(dst_tmp1, stride3);
+
+  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+  if (__lsx_bz_v(flat2)) {
+    DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48, 48,
+              p2, p1, p0, q0);
+    DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);
+    __lsx_vstx(p2, dst, -stride3);
+    __lsx_vstx(p1, dst, -stride2);
+    __lsx_vstx(p0, dst, -stride);
+    __lsx_vst(q0, dst, 0);
+    __lsx_vstx(q1, dst, stride);
+    __lsx_vstx(q2, dst, stride2);
+  } else {
+    dst = dst_tmp0 - stride3;
+
+    p7_l_in = (v8u16)__lsx_vsllwil_hu_bu(p7, 0);
+    p6_l_in = (v8u16)__lsx_vsllwil_hu_bu(p6, 0);
+    p5_l_in = (v8u16)__lsx_vsllwil_hu_bu(p5, 0);
+    p4_l_in = (v8u16)__lsx_vsllwil_hu_bu(p4, 0);
+    p3_l_in = (v8u16)__lsx_vsllwil_hu_bu(p3, 0);
+    p2_l_in = (v8u16)__lsx_vsllwil_hu_bu(p2, 0);
+    p1_l_in = (v8u16)__lsx_vsllwil_hu_bu(p1, 0);
+    p0_l_in = (v8u16)__lsx_vsllwil_hu_bu(p0, 0);
+    q0_l_in = (v8u16)__lsx_vsllwil_hu_bu(q0, 0);
+
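+    /* p6 output: (7*p7 + 2*p6 + p5 + p4 + p3 + p2 + p1 + p0 + q0 + 8) >> 4 */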
+    tmp0_l = p7_l_in << 3;
+    tmp0_l -= p7_l_in;
+    tmp0_l += p6_l_in;
+    tmp0_l += q0_l_in;
+    tmp1_l = p6_l_in + p5_l_in;
+    tmp1_l += p4_l_in;
+    tmp1_l += p3_l_in;
+    tmp1_l += p2_l_in;
+    tmp1_l += p1_l_in;
+    tmp1_l += p0_l_in;
+    tmp1_l += tmp0_l;
+
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    p7_h_in = (v8u16)__lsx_vexth_hu_bu(p7);
+    p6_h_in = (v8u16)__lsx_vexth_hu_bu(p6);
+    p5_h_in = (v8u16)__lsx_vexth_hu_bu(p5);
+    p4_h_in = (v8u16)__lsx_vexth_hu_bu(p4);
+    p3_h_in = (v8u16)__lsx_vexth_hu_bu(p3);
+    p2_h_in = (v8u16)__lsx_vexth_hu_bu(p2);
+    p1_h_in = (v8u16)__lsx_vexth_hu_bu(p1);
+    p0_h_in = (v8u16)__lsx_vexth_hu_bu(p0);
+    q0_h_in = (v8u16)__lsx_vexth_hu_bu(q0);
+
+    tmp0_h = p7_h_in << 3;
+    tmp0_h -= p7_h_in;
+    tmp0_h += p6_h_in;
+    tmp0_h += q0_h_in;
+    tmp1_h = p6_h_in + p5_h_in;
+    tmp1_h += p4_h_in;
+    tmp1_h += p3_h_in;
+    tmp1_h += p2_h_in;
+    tmp1_h += p1_h_in;
+    tmp1_h += p0_h_in;
+    tmp1_h += tmp0_h;
+
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    p6 = __lsx_vbitsel_v(p6, out_l, flat2);
+    __lsx_vst(p6, dst, 0);
+    dst += stride;
+
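+    /* Remaining outputs update the running sum incrementally: add the next q
+     * tap, drop one copy of p7, shift the doubled center tap by one, then
+     * round and shift right by 4. */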
+    /* p5 */
+    q1_l_in = (v8u16)__lsx_vsllwil_hu_bu(q1, 0);
+    tmp0_l = p5_l_in - p6_l_in;
+    tmp0_l += q1_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    q1_h_in = (v8u16)__lsx_vexth_hu_bu(q1);
+    tmp0_h = p5_h_in - p6_h_in;
+    tmp0_h += q1_h_in;
+    tmp0_h -= p7_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    p5 = __lsx_vbitsel_v(p5, out_l, flat2);
+    __lsx_vst(p5, dst, 0);
+    dst += stride;
+
+    /* p4 */
+    q2_l_in = (v8u16)__lsx_vsllwil_hu_bu(q2, 0);
+    tmp0_l = p4_l_in - p5_l_in;
+    tmp0_l += q2_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    q2_h_in = (v8u16)__lsx_vexth_hu_bu(q2);
+    tmp0_h = p4_h_in - p5_h_in;
+    tmp0_h += q2_h_in;
+    tmp0_h -= p7_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    p4 = __lsx_vbitsel_v(p4, out_l, flat2);
+    __lsx_vst(p4, dst, 0);
+    dst += stride;
+
+    /* p3 */
+    q3_l_in = (v8u16)__lsx_vsllwil_hu_bu(q3, 0);
+    tmp0_l = p3_l_in - p4_l_in;
+    tmp0_l += q3_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    q3_h_in = (v8u16)__lsx_vexth_hu_bu(q3);
+    tmp0_h = p3_h_in - p4_h_in;
+    tmp0_h += q3_h_in;
+    tmp0_h -= p7_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    p3 = __lsx_vbitsel_v(p3, out_l, flat2);
+    __lsx_vst(p3, dst, 0);
+    dst += stride;
+
+    /* p2 */
+    q4_l_in = (v8u16)__lsx_vsllwil_hu_bu(q4, 0);
+    filter8 = __lsx_vld(filter48, 0);
+    tmp0_l = p2_l_in - p3_l_in;
+    tmp0_l += q4_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    q4_h_in = (v8u16)__lsx_vexth_hu_bu(q4);
+    tmp0_h = p2_h_in - p3_h_in;
+    tmp0_h += q4_h_in;
+    tmp0_h -= p7_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+    __lsx_vst(filter8, dst, 0);
+    dst += stride;
+
+    /* p1 */
+    q5_l_in = (v8u16)__lsx_vsllwil_hu_bu(q5, 0);
+    filter8 = __lsx_vld(filter48, 16);
+    tmp0_l = p1_l_in - p2_l_in;
+    tmp0_l += q5_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    q5_h_in = (v8u16)__lsx_vexth_hu_bu(q5);
+    tmp0_h = p1_h_in - p2_h_in;
+    tmp0_h += q5_h_in;
+    tmp0_h -= p7_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+    __lsx_vst(filter8, dst, 0);
+    dst += stride;
+
+    /* p0 */
+    q6_l_in = (v8u16)__lsx_vsllwil_hu_bu(q6, 0);
+    filter8 = __lsx_vld(filter48, 32);
+    tmp0_l = p0_l_in - p1_l_in;
+    tmp0_l += q6_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    q6_h_in = (v8u16)__lsx_vexth_hu_bu(q6);
+    tmp0_h = p0_h_in - p1_h_in;
+    tmp0_h += q6_h_in;
+    tmp0_h -= p7_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+    __lsx_vst(filter8, dst, 0);
+    dst += stride;
+
+    /* q0 */
+    q7_l_in = (v8u16)__lsx_vsllwil_hu_bu(q7, 0);
+    filter8 = __lsx_vld(filter48, 48);
+    tmp0_l = q7_l_in - p0_l_in;
+    tmp0_l += q0_l_in;
+    tmp0_l -= p7_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    q7_h_in = (v8u16)__lsx_vexth_hu_bu(q7);
+    tmp0_h = q7_h_in - p0_h_in;
+    tmp0_h += q0_h_in;
+    tmp0_h -= p7_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+    __lsx_vst(filter8, dst, 0);
+    dst += stride;
+
+    /* q1 */
+    filter8 = __lsx_vld(filter48, 64);
+    tmp0_l = q7_l_in - q0_l_in;
+    tmp0_l += q1_l_in;
+    tmp0_l -= p6_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    tmp0_h = q7_h_in - q0_h_in;
+    tmp0_h += q1_h_in;
+    tmp0_h -= p6_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+    __lsx_vst(filter8, dst, 0);
+    dst += stride;
+
+    /* q2 */
+    filter8 = __lsx_vld(filter48, 80);
+    tmp0_l = q7_l_in - q1_l_in;
+    tmp0_l += q2_l_in;
+    tmp0_l -= p5_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    tmp0_h = q7_h_in - q1_h_in;
+    tmp0_h += q2_h_in;
+    tmp0_h -= p5_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+    __lsx_vst(filter8, dst, 0);
+    dst += stride;
+
+    /* q3 */
+    tmp0_l = q7_l_in - q2_l_in;
+    tmp0_l += q3_l_in;
+    tmp0_l -= p4_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    tmp0_h = q7_h_in - q2_h_in;
+    tmp0_h += q3_h_in;
+    tmp0_h -= p4_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    q3 = __lsx_vbitsel_v(q3, out_l, flat2);
+    __lsx_vst(q3, dst, 0);
+    dst += stride;
+
+    /* q4 */
+    tmp0_l = q7_l_in - q3_l_in;
+    tmp0_l += q4_l_in;
+    tmp0_l -= p3_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    tmp0_h = q7_h_in - q3_h_in;
+    tmp0_h += q4_h_in;
+    tmp0_h -= p3_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    q4 = __lsx_vbitsel_v(q4, out_l, flat2);
+    __lsx_vst(q4, dst, 0);
+    dst += stride;
+
+    /* q5 */
+    tmp0_l = q7_l_in - q4_l_in;
+    tmp0_l += q5_l_in;
+    tmp0_l -= p2_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    tmp0_h = q7_h_in - q4_h_in;
+    tmp0_h += q5_h_in;
+    tmp0_h -= p2_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    q5 = __lsx_vbitsel_v(q5, out_l, flat2);
+    __lsx_vst(q5, dst, 0);
+    dst += stride;
+
+    /* q6 */
+    tmp0_l = q7_l_in - q5_l_in;
+    tmp0_l += q6_l_in;
+    tmp0_l -= p1_l_in;
+    tmp1_l += tmp0_l;
+    out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+
+    tmp0_h = q7_h_in - q5_h_in;
+    tmp0_h += q6_h_in;
+    tmp0_h -= p1_h_in;
+    tmp1_h += tmp0_h;
+    out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+    out_l = __lsx_vpickev_b(out_h, out_l);
+    q6 = __lsx_vbitsel_v(q6, out_l, flat2);
+    __lsx_vst(q6, dst, 0);
+  }
+}
+
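+/* 16-pixel-wide horizontal MB edge: run the t4/t8 stage, and only if it did
+ * not take the early exit, run the t16 stage. */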
+static void mb_lpf_horizontal_edge_dual(uint8_t *dst, int32_t stride,
+                                        const uint8_t *b_limit_ptr,
+                                        const uint8_t *limit_ptr,
+                                        const uint8_t *thresh_ptr) {
+  DECLARE_ALIGNED(16, uint8_t, filter48[16 * 8]);
+  uint8_t early_exit = 0;
+
+  early_exit = hz_lpf_t4_and_t8_16w(dst, stride, &filter48[0], b_limit_ptr,
+                                    limit_ptr, thresh_ptr);
+
+  if (early_exit == 0) {
+    hz_lpf_t16_16w(dst, stride, filter48);
+  }
+}
+
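+/* count == 1 handles a single 8-pixel-wide edge in-register (stores write the
+ * low 8-byte halves); any other count defers to the 16-pixel dual path. */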
+static void mb_lpf_horizontal_edge(uint8_t *dst, int32_t stride,
+                                   const uint8_t *b_limit_ptr,
+                                   const uint8_t *limit_ptr,
+                                   const uint8_t *thresh_ptr, int32_t count) {
+  if (count == 1) {
+    __m128i flat2, mask, hev, flat, thresh, b_limit, limit;
+    __m128i p3, p2, p1, p0, q3, q2, q1, q0, p7, p6, p5, p4, q4, q5, q6, q7;
+    __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+    __m128i p0_filter16, p1_filter16;
+    __m128i p2_filter8, p1_filter8, p0_filter8;
+    __m128i q0_filter8, q1_filter8, q2_filter8;
+    __m128i p7_l, p6_l, p5_l, p4_l, q7_l, q6_l, q5_l, q4_l;
+    __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;
+    __m128i zero = __lsx_vldi(0);
+    __m128i tmp0, tmp1, tmp2;
+
+    int32_t stride2 = stride << 1;
+    int32_t stride3 = stride2 + stride;
+    int32_t stride4 = stride2 << 1;
+    uint8_t *dst_tmp0 = dst - stride4;
+    uint8_t *dst_tmp1 = dst + stride4;
+
+    /* load vector elements */
+    DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
+              -stride, p3, p2, p1, p0);
+    q0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+    q3 = __lsx_vldx(dst, stride3);
+
+    thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+    b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+    limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+    /* filter_mask* */
+    LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+                 mask, flat);
+    VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+    VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out,
+                       q1_out);
+    flat = __lsx_vilvl_d(zero, flat);
+    if (__lsx_bz_v(flat)) {
+      __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
+      __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
+      __lsx_vstelm_d(q0_out, dst, 0, 0);
+      __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
+    } else {
+      /* convert 8 bit input data into 16 bit */
+      DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l,
+                p2_l, p1_l, p0_l);
+      DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l,
+                q1_l, q2_l, q3_l);
+      VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8,
+                  p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+      /* convert 16 bit output data into 8 bit */
+      DUP4_ARG2(__lsx_vpickev_b, zero, p2_filter8, zero, p1_filter8, zero,
+                p0_filter8, zero, q0_filter8, p2_filter8, p1_filter8,
+                p0_filter8, q0_filter8);
+      DUP2_ARG2(__lsx_vpickev_b, zero, q1_filter8, zero, q2_filter8, q1_filter8,
+                q2_filter8);
+
+      /* store pixel values */
+      p2_out = __lsx_vbitsel_v(p2, p2_filter8, flat);
+      p1_out = __lsx_vbitsel_v(p1_out, p1_filter8, flat);
+      p0_out = __lsx_vbitsel_v(p0_out, p0_filter8, flat);
+      q0_out = __lsx_vbitsel_v(q0_out, q0_filter8, flat);
+      q1_out = __lsx_vbitsel_v(q1_out, q1_filter8, flat);
+      q2_out = __lsx_vbitsel_v(q2, q2_filter8, flat);
+
+      /* load 16 vector elements */
+      DUP4_ARG2(__lsx_vldx, dst_tmp0, -stride4, dst_tmp0, -stride3, dst_tmp0,
+                -stride2, dst_tmp0, -stride, p7, p6, p5, p4);
+      q4 = __lsx_vld(dst_tmp1, 0);
+      DUP2_ARG2(__lsx_vldx, dst_tmp1, stride, dst_tmp1, stride2, q5, q6);
+      q7 = __lsx_vldx(dst_tmp1, stride3);
+
+      VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+
+      if (__lsx_bz_v(flat2)) {
+        dst -= stride3;
+        __lsx_vstelm_d(p2_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p0_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(q0_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(q1_out, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(q2_out, dst, 0, 0);
+      } else {
+        /* process the low (right) 8 pixels */
+        DUP4_ARG2(__lsx_vilvl_b, zero, p7, zero, p6, zero, p5, zero, p4, p7_l,
+                  p6_l, p5_l, p4_l);
+        DUP4_ARG2(__lsx_vilvl_b, zero, q4, zero, q5, zero, q6, zero, q7, q4_l,
+                  q5_l, q6_l, q7_l);
+
+        tmp0 = __lsx_vslli_h(p7_l, 3);
+        tmp0 = __lsx_vsub_h(tmp0, p7_l);
+        tmp0 = __lsx_vadd_h(tmp0, p6_l);
+        tmp0 = __lsx_vadd_h(tmp0, q0_l);
+
+        dst = dst_tmp0 - stride3;
+
+        /* calculation of p6 and p5 */
+        tmp1 = __lsx_vadd_h(p6_l, p5_l);
+        tmp1 = __lsx_vadd_h(tmp1, p4_l);
+        tmp1 = __lsx_vadd_h(tmp1, p3_l);
+        tmp1 = __lsx_vadd_h(tmp1, p2_l);
+        tmp1 = __lsx_vadd_h(tmp1, p1_l);
+        tmp1 = __lsx_vadd_h(tmp1, p0_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp0 = __lsx_vsub_h(p5_l, p6_l);
+        tmp0 = __lsx_vadd_h(tmp0, q1_l);
+        tmp0 = __lsx_vsub_h(tmp0, p7_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, p6, p0_filter16, flat2, p5, p1_filter16,
+                  flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of p4 and p3 */
+        tmp0 = __lsx_vsub_h(p4_l, p5_l);
+        tmp0 = __lsx_vadd_h(tmp0, q2_l);
+        tmp0 = __lsx_vsub_h(tmp0, p7_l);
+        tmp2 = __lsx_vsub_h(p3_l, p4_l);
+        tmp2 = __lsx_vadd_h(tmp2, q3_l);
+        tmp2 = __lsx_vsub_h(tmp2, p7_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, p4, p0_filter16, flat2, p3, p1_filter16,
+                  flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of p2 and p1 */
+        tmp0 = __lsx_vsub_h(p2_l, p3_l);
+        tmp0 = __lsx_vadd_h(tmp0, q4_l);
+        tmp0 = __lsx_vsub_h(tmp0, p7_l);
+        tmp2 = __lsx_vsub_h(p1_l, p2_l);
+        tmp2 = __lsx_vadd_h(tmp2, q5_l);
+        tmp2 = __lsx_vsub_h(tmp2, p7_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, p2_out, p0_filter16, flat2, p1_out,
+                  p1_filter16, flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of p0 and q0 */
+        tmp0 = __lsx_vsub_h(p0_l, p1_l);
+        tmp0 = __lsx_vadd_h(tmp0, q6_l);
+        tmp0 = __lsx_vsub_h(tmp0, p7_l);
+        tmp2 = __lsx_vsub_h(q7_l, p0_l);
+        tmp2 = __lsx_vadd_h(tmp2, q0_l);
+        tmp2 = __lsx_vsub_h(tmp2, p7_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, p0_out, p0_filter16, flat2, q0_out,
+                  p1_filter16, flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of q1 and q2 */
+        tmp0 = __lsx_vsub_h(q7_l, q0_l);
+        tmp0 = __lsx_vadd_h(tmp0, q1_l);
+        tmp0 = __lsx_vsub_h(tmp0, p6_l);
+        tmp2 = __lsx_vsub_h(q7_l, q1_l);
+        tmp2 = __lsx_vadd_h(tmp2, q2_l);
+        tmp2 = __lsx_vsub_h(tmp2, p5_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, q1_out, p0_filter16, flat2, q2_out,
+                  p1_filter16, flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of q3 and q4 */
+        tmp0 = __lsx_vsub_h(q7_l, q2_l);
+        tmp0 = __lsx_vadd_h(tmp0, q3_l);
+        tmp0 = __lsx_vsub_h(tmp0, p4_l);
+        tmp2 = __lsx_vsub_h(q7_l, q3_l);
+        tmp2 = __lsx_vadd_h(tmp2, q4_l);
+        tmp2 = __lsx_vsub_h(tmp2, p3_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, q3, p0_filter16, flat2, q4, p1_filter16,
+                  flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+        dst += stride;
+
+        /* calculation of q5 and q6 */
+        tmp0 = __lsx_vsub_h(q7_l, q4_l);
+        tmp0 = __lsx_vadd_h(tmp0, q5_l);
+        tmp0 = __lsx_vsub_h(tmp0, p2_l);
+        tmp2 = __lsx_vsub_h(q7_l, q5_l);
+        tmp2 = __lsx_vadd_h(tmp2, q6_l);
+        tmp2 = __lsx_vsub_h(tmp2, p1_l);
+        tmp1 = __lsx_vadd_h(tmp1, tmp0);
+        p0_filter16 = __lsx_vsrari_h(tmp1, 4);
+        tmp1 = __lsx_vadd_h(tmp1, tmp2);
+        p1_filter16 = __lsx_vsrari_h(tmp1, 4);
+        DUP2_ARG2(__lsx_vpickev_b, zero, p0_filter16, zero, p1_filter16,
+                  p0_filter16, p1_filter16);
+        DUP2_ARG3(__lsx_vbitsel_v, q5, p0_filter16, flat2, q6, p1_filter16,
+                  flat2, p0_filter16, p1_filter16);
+        __lsx_vstelm_d(p0_filter16, dst, 0, 0);
+        dst += stride;
+        __lsx_vstelm_d(p1_filter16, dst, 0, 0);
+      }
+    }
+  } else {
+    mb_lpf_horizontal_edge_dual(dst, stride, b_limit_ptr, limit_ptr,
+                                thresh_ptr);
+  }
+}
+
+void vpx_lpf_horizontal_16_dual_lsx(uint8_t *dst, int32_t stride,
+                                    const uint8_t *b_limit_ptr,
+                                    const uint8_t *limit_ptr,
+                                    const uint8_t *thresh_ptr) {
+  mb_lpf_horizontal_edge(dst, stride, b_limit_ptr, limit_ptr, thresh_ptr, 2);
+}
+
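+/* Transpose a 16x16 byte block so that vertical edges become horizontal
+ * ones; the first 16x8 half goes through LSX_TRANSPOSE16x8_B, the second is
+ * built from pack even/odd instructions. */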
+static void transpose_16x16(uint8_t *input, int32_t in_stride, uint8_t *output,
+                            int32_t out_stride) {
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+  __m128i row8, row9, row10, row11, row12, row13, row14, row15;
+  __m128i tmp0, tmp1, tmp4, tmp5, tmp6, tmp7;
+  __m128i tmp2, tmp3;
+  __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  int32_t in_stride2 = in_stride << 1;
+  int32_t in_stride3 = in_stride2 + in_stride;
+  int32_t in_stride4 = in_stride2 << 1;
+  int32_t out_stride2 = out_stride << 1;
+  int32_t out_stride3 = out_stride2 + out_stride;
+  int32_t out_stride4 = out_stride2 << 1;
+
+  LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4, row0, row1,
+           row2, row3, row4, row5, row6, row7);
+  input += in_stride4;
+  LSX_LD_8(input, in_stride, in_stride2, in_stride3, in_stride4, row8, row9,
+           row10, row11, row12, row13, row14, row15);
+
+  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+                      row9, row10, row11, row12, row13, row14, row15, p7, p6,
+                      p5, p4, p3, p2, p1, p0);
+
+  /* transpose 16x8 matrix into 8x16 */
+  /* total: 8 intermediate registers and 32 instructions */
+  q7 = __lsx_vpackod_d(row8, row0);
+  q6 = __lsx_vpackod_d(row9, row1);
+  q5 = __lsx_vpackod_d(row10, row2);
+  q4 = __lsx_vpackod_d(row11, row3);
+  q3 = __lsx_vpackod_d(row12, row4);
+  q2 = __lsx_vpackod_d(row13, row5);
+  q1 = __lsx_vpackod_d(row14, row6);
+  q0 = __lsx_vpackod_d(row15, row7);
+
+  DUP2_ARG2(__lsx_vpackev_b, q6, q7, q4, q5, tmp0, tmp1);
+  DUP2_ARG2(__lsx_vpackod_b, q6, q7, q4, q5, tmp4, tmp5);
+
+  DUP2_ARG2(__lsx_vpackev_b, q2, q3, q0, q1, q5, q7);
+  DUP2_ARG2(__lsx_vpackod_b, q2, q3, q0, q1, tmp6, tmp7);
+
+  DUP2_ARG2(__lsx_vpackev_h, tmp1, tmp0, q7, q5, tmp2, tmp3);
+  q0 = __lsx_vpackev_w(tmp3, tmp2);
+  q4 = __lsx_vpackod_w(tmp3, tmp2);
+
+  tmp2 = __lsx_vpackod_h(tmp1, tmp0);
+  tmp3 = __lsx_vpackod_h(q7, q5);
+  q2 = __lsx_vpackev_w(tmp3, tmp2);
+  q6 = __lsx_vpackod_w(tmp3, tmp2);
+
+  DUP2_ARG2(__lsx_vpackev_h, tmp5, tmp4, tmp7, tmp6, tmp2, tmp3);
+  q1 = __lsx_vpackev_w(tmp3, tmp2);
+  q5 = __lsx_vpackod_w(tmp3, tmp2);
+
+  tmp2 = __lsx_vpackod_h(tmp5, tmp4);
+  tmp3 = __lsx_vpackod_h(tmp7, tmp6);
+  q3 = __lsx_vpackev_w(tmp3, tmp2);
+  q7 = __lsx_vpackod_w(tmp3, tmp2);
+
+  LSX_ST_8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_stride, out_stride2,
+           out_stride3, out_stride4);
+  output += out_stride4;
+  LSX_ST_8(q0, q1, q2, q3, q4, q5, q6, q7, output, out_stride, out_stride2,
+           out_stride3, out_stride4);
+}
+
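+/* Vertical-edge t4/t8 stage. dst points into the transposed 16x16 buffer
+ * (rows are contiguous 16-byte vectors, hence the -64..48 byte offsets); on
+ * early exit the 4-tap results are transposed back and stored directly to
+ * the original image through dst_org. */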
+static int32_t vt_lpf_t4_and_t8_16w(uint8_t *dst, uint8_t *filter48,
+                                    uint8_t *dst_org, int32_t stride,
+                                    const uint8_t *b_limit_ptr,
+                                    const uint8_t *limit_ptr,
+                                    const uint8_t *thresh_ptr) {
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  int32_t stride4 = stride2 << 1;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+  __m128i flat, mask, hev, thresh, b_limit, limit;
+  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
+  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
+  __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+
+  /* load vector elements */
+  DUP4_ARG2(__lsx_vld, dst, -64, dst, -48, dst, -32, dst, -16, p3, p2, p1, p0);
+  DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
+
+  thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+  b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+  limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  /* flat4 */
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  /* filter4 */
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  /* if flat is zero for all pixels, the remaining filters can be skipped */
+  if (__lsx_bz_v(flat)) {
+    DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+    vec2 = __lsx_vilvl_h(vec1, vec0);
+    vec3 = __lsx_vilvh_h(vec1, vec0);
+    DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, vec0, vec1);
+    vec4 = __lsx_vilvl_h(vec1, vec0);
+    vec5 = __lsx_vilvh_h(vec1, vec0);
+
+    dst_org -= 2;
+    __lsx_vstelm_w(vec2, dst_org, 0, 0);
+    __lsx_vstelm_w(vec2, dst_org + stride, 0, 1);
+    __lsx_vstelm_w(vec2, dst_org + stride2, 0, 2);
+    __lsx_vstelm_w(vec2, dst_org + stride3, 0, 3);
+    dst_org += stride4;
+    __lsx_vstelm_w(vec3, dst_org, 0, 0);
+    __lsx_vstelm_w(vec3, dst_org + stride, 0, 1);
+    __lsx_vstelm_w(vec3, dst_org + stride2, 0, 2);
+    __lsx_vstelm_w(vec3, dst_org + stride3, 0, 3);
+    dst_org += stride4;
+    __lsx_vstelm_w(vec4, dst_org, 0, 0);
+    __lsx_vstelm_w(vec4, dst_org + stride, 0, 1);
+    __lsx_vstelm_w(vec4, dst_org + stride2, 0, 2);
+    __lsx_vstelm_w(vec4, dst_org + stride3, 0, 3);
+    dst_org += stride4;
+    __lsx_vstelm_w(vec5, dst_org, 0, 0);
+    __lsx_vstelm_w(vec5, dst_org + stride, 0, 1);
+    __lsx_vstelm_w(vec5, dst_org + stride2, 0, 2);
+    __lsx_vstelm_w(vec5, dst_org + stride3, 0, 3);
+
+    return 1;
+  }
+
+  DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
+            p0_l);
+  DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+            q3_l);
+  VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+              p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+  DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h);
+  DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);
+  VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
+              p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
+
+  /* convert 16 bit output data into 8 bit */
+  DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l,
+            p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
+            p1_filt8_l, p0_filt8_l, q0_filt8_l);
+  DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
+            q1_filt8_l, q2_filt8_l);
+
+  /* store pixel values */
+  p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
+  p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
+  p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
+  q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
+  q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
+  q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
+
+  __lsx_vst(p2_out, filter48, 0);
+  __lsx_vst(p1_out, filter48, 16);
+  __lsx_vst(p0_out, filter48, 32);
+  __lsx_vst(q0_out, filter48, 48);
+  __lsx_vst(q1_out, filter48, 64);
+  __lsx_vst(q2_out, filter48, 80);
+  __lsx_vst(flat, filter48, 96);
+
+  return 0;
+}
+
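+/* Vertical-edge t16 stage, operating on the transposed buffer. On early exit
+ * (flat2 all zero) the saved filter8 columns are written back to the image
+ * through dst_org and 1 is returned so the caller skips the final transpose;
+ * otherwise the filter16 outputs are stored to the transposed buffer. */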
+static int32_t vt_lpf_t16_16w(uint8_t *dst, uint8_t *dst_org, int32_t stride,
+                              uint8_t *filter48) {
+  __m128i flat, flat2, filter8;
+  __m128i p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+  __m128i out_l, out_h;
+  v8u16 p7_l_in, p6_l_in, p5_l_in, p4_l_in;
+  v8u16 p3_l_in, p2_l_in, p1_l_in, p0_l_in;
+  v8u16 q7_l_in, q6_l_in, q5_l_in, q4_l_in;
+  v8u16 q3_l_in, q2_l_in, q1_l_in, q0_l_in;
+  v8u16 p7_h_in, p6_h_in, p5_h_in, p4_h_in;
+  v8u16 p3_h_in, p2_h_in, p1_h_in, p0_h_in;
+  v8u16 q7_h_in, q6_h_in, q5_h_in, q4_h_in;
+  v8u16 q3_h_in, q2_h_in, q1_h_in, q0_h_in;
+  v8u16 tmp0_l, tmp1_l, tmp0_h, tmp1_h;
+  uint8_t *dst_tmp = dst - 128;
+
+  flat = __lsx_vld(filter48, 96);
+
+  DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48, p7,
+            p6, p5, p4);
+  DUP4_ARG2(__lsx_vld, dst_tmp, 64, dst_tmp, 80, dst_tmp, 96, dst_tmp, 112, p3,
+            p2, p1, p0);
+  DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, q0, q1, q2, q3);
+  DUP4_ARG2(__lsx_vld, dst, 64, dst, 80, dst, 96, dst, 112, q4, q5, q6, q7);
+
+  VP9_FLAT5(p7, p6, p5, p4, p0, q0, q4, q5, q6, q7, flat, flat2);
+  /* if flat2 is zero for all pixels, the remaining filters can be skipped */
+  if (__lsx_bz_v(flat2)) {
+    __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+
+    DUP4_ARG2(__lsx_vld, filter48, 0, filter48, 16, filter48, 32, filter48, 48,
+              p2, p1, p0, q0);
+    DUP2_ARG2(__lsx_vld, filter48, 64, filter48, 80, q1, q2);
+
+    DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, vec0, vec1);
+    vec3 = __lsx_vilvl_h(vec1, vec0);
+    vec4 = __lsx_vilvh_h(vec1, vec0);
+    DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, vec0, vec1);
+    vec6 = __lsx_vilvl_h(vec1, vec0);
+    vec7 = __lsx_vilvh_h(vec1, vec0);
+    vec2 = __lsx_vilvl_b(q2, q1);
+    vec5 = __lsx_vilvh_b(q2, q1);
+
+    dst_org -= 3;
+    __lsx_vstelm_w(vec3, dst_org, 0, 0);
+    __lsx_vstelm_h(vec2, dst_org, 4, 0);
+    dst_org += stride;
+    __lsx_vstelm_w(vec3, dst_org, 0, 1);
+    __lsx_vstelm_h(vec2, dst_org, 4, 1);
+    dst_org += stride;
+    __lsx_vstelm_w(vec3, dst_org, 0, 2);
+    __lsx_vstelm_h(vec2, dst_org, 4, 2);
+    dst_org += stride;
+    __lsx_vstelm_w(vec3, dst_org, 0, 3);
+    __lsx_vstelm_h(vec2, dst_org, 4, 3);
+    dst_org += stride;
+    __lsx_vstelm_w(vec4, dst_org, 0, 0);
+    __lsx_vstelm_h(vec2, dst_org, 4, 4);
+    dst_org += stride;
+    __lsx_vstelm_w(vec4, dst_org, 0, 1);
+    __lsx_vstelm_h(vec2, dst_org, 4, 5);
+    dst_org += stride;
+    __lsx_vstelm_w(vec4, dst_org, 0, 2);
+    __lsx_vstelm_h(vec2, dst_org, 4, 6);
+    dst_org += stride;
+    __lsx_vstelm_w(vec4, dst_org, 0, 3);
+    __lsx_vstelm_h(vec2, dst_org, 4, 7);
+    dst_org += stride;
+    __lsx_vstelm_w(vec6, dst_org, 0, 0);
+    __lsx_vstelm_h(vec5, dst_org, 4, 0);
+    dst_org += stride;
+    __lsx_vstelm_w(vec6, dst_org, 0, 1);
+    __lsx_vstelm_h(vec5, dst_org, 4, 1);
+    dst_org += stride;
+    __lsx_vstelm_w(vec6, dst_org, 0, 2);
+    __lsx_vstelm_h(vec5, dst_org, 4, 2);
+    dst_org += stride;
+    __lsx_vstelm_w(vec6, dst_org, 0, 3);
+    __lsx_vstelm_h(vec5, dst_org, 4, 3);
+    dst_org += stride;
+    __lsx_vstelm_w(vec7, dst_org, 0, 0);
+    __lsx_vstelm_h(vec5, dst_org, 4, 4);
+    dst_org += stride;
+    __lsx_vstelm_w(vec7, dst_org, 0, 1);
+    __lsx_vstelm_h(vec5, dst_org, 4, 5);
+    dst_org += stride;
+    __lsx_vstelm_w(vec7, dst_org, 0, 2);
+    __lsx_vstelm_h(vec5, dst_org, 4, 6);
+    dst_org += stride;
+    __lsx_vstelm_w(vec7, dst_org, 0, 3);
+    __lsx_vstelm_h(vec5, dst_org, 4, 7);
+
+    return 1;
+  }
+
+  dst -= 7 * 16;
+
+  p7_l_in = (v8u16)__lsx_vsllwil_hu_bu(p7, 0);
+  p6_l_in = (v8u16)__lsx_vsllwil_hu_bu(p6, 0);
+  p5_l_in = (v8u16)__lsx_vsllwil_hu_bu(p5, 0);
+  p4_l_in = (v8u16)__lsx_vsllwil_hu_bu(p4, 0);
+  p3_l_in = (v8u16)__lsx_vsllwil_hu_bu(p3, 0);
+  p2_l_in = (v8u16)__lsx_vsllwil_hu_bu(p2, 0);
+  p1_l_in = (v8u16)__lsx_vsllwil_hu_bu(p1, 0);
+  p0_l_in = (v8u16)__lsx_vsllwil_hu_bu(p0, 0);
+  q0_l_in = (v8u16)__lsx_vsllwil_hu_bu(q0, 0);
+
+  tmp0_l = p7_l_in << 3;
+  tmp0_l -= p7_l_in;
+  tmp0_l += p6_l_in;
+  tmp0_l += q0_l_in;
+  tmp1_l = p6_l_in + p5_l_in;
+  tmp1_l += p4_l_in;
+  tmp1_l += p3_l_in;
+  tmp1_l += p2_l_in;
+  tmp1_l += p1_l_in;
+  tmp1_l += p0_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  p7_h_in = (v8u16)__lsx_vexth_hu_bu(p7);
+  p6_h_in = (v8u16)__lsx_vexth_hu_bu(p6);
+  p5_h_in = (v8u16)__lsx_vexth_hu_bu(p5);
+  p4_h_in = (v8u16)__lsx_vexth_hu_bu(p4);
+  p3_h_in = (v8u16)__lsx_vexth_hu_bu(p3);
+  p2_h_in = (v8u16)__lsx_vexth_hu_bu(p2);
+  p1_h_in = (v8u16)__lsx_vexth_hu_bu(p1);
+  p0_h_in = (v8u16)__lsx_vexth_hu_bu(p0);
+  q0_h_in = (v8u16)__lsx_vexth_hu_bu(q0);
+
+  tmp0_h = p7_h_in << 3;
+  tmp0_h -= p7_h_in;
+  tmp0_h += p6_h_in;
+  tmp0_h += q0_h_in;
+  tmp1_h = p6_h_in + p5_h_in;
+  tmp1_h += p4_h_in;
+  tmp1_h += p3_h_in;
+  tmp1_h += p2_h_in;
+  tmp1_h += p1_h_in;
+  tmp1_h += p0_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  p6 = __lsx_vbitsel_v(p6, out_l, flat2);
+  __lsx_vst(p6, dst, 0);
+
+  /* p5 */
+  q1_l_in = (v8u16)__lsx_vsllwil_hu_bu(q1, 0);
+  tmp0_l = p5_l_in - p6_l_in;
+  tmp0_l += q1_l_in;
+  tmp0_l -= p7_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  q1_h_in = (v8u16)__lsx_vexth_hu_bu(q1);
+  tmp0_h = p5_h_in - p6_h_in;
+  tmp0_h += q1_h_in;
+  tmp0_h -= p7_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  p5 = __lsx_vbitsel_v(p5, out_l, flat2);
+  __lsx_vst(p5, dst, 16);
+
+  /* p4 */
+  q2_l_in = (v8u16)__lsx_vsllwil_hu_bu(q2, 0);
+  tmp0_l = p4_l_in - p5_l_in;
+  tmp0_l += q2_l_in;
+  tmp0_l -= p7_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  q2_h_in = (v8u16)__lsx_vexth_hu_bu(q2);
+  tmp0_h = p4_h_in - p5_h_in;
+  tmp0_h += q2_h_in;
+  tmp0_h -= p7_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  p4 = __lsx_vbitsel_v(p4, out_l, flat2);
+  __lsx_vst(p4, dst, 16 * 2);
+
+  /* p3 */
+  q3_l_in = (v8u16)__lsx_vsllwil_hu_bu(q3, 0);
+  tmp0_l = p3_l_in - p4_l_in;
+  tmp0_l += q3_l_in;
+  tmp0_l -= p7_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  q3_h_in = (v8u16)__lsx_vexth_hu_bu(q3);
+  tmp0_h = p3_h_in - p4_h_in;
+  tmp0_h += q3_h_in;
+  tmp0_h -= p7_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  p3 = __lsx_vbitsel_v(p3, out_l, flat2);
+  __lsx_vst(p3, dst, 16 * 3);
+
+  /* p2 */
+  q4_l_in = (v8u16)__lsx_vsllwil_hu_bu(q4, 0);
+  filter8 = __lsx_vld(filter48, 0);
+  tmp0_l = p2_l_in - p3_l_in;
+  tmp0_l += q4_l_in;
+  tmp0_l -= p7_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  q4_h_in = (v8u16)__lsx_vexth_hu_bu(q4);
+  tmp0_h = p2_h_in - p3_h_in;
+  tmp0_h += q4_h_in;
+  tmp0_h -= p7_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+  __lsx_vst(filter8, dst, 16 * 4);
+
+  /* p1 */
+  q5_l_in = (v8u16)__lsx_vsllwil_hu_bu(q5, 0);
+  filter8 = __lsx_vld(filter48, 16);
+  tmp0_l = p1_l_in - p2_l_in;
+  tmp0_l += q5_l_in;
+  tmp0_l -= p7_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  q5_h_in = (v8u16)__lsx_vexth_hu_bu(q5);
+  tmp0_h = p1_h_in - p2_h_in;
+  tmp0_h += q5_h_in;
+  tmp0_h -= p7_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+  __lsx_vst(filter8, dst, 16 * 5);
+
+  /* p0 */
+  q6_l_in = (v8u16)__lsx_vsllwil_hu_bu(q6, 0);
+  filter8 = __lsx_vld(filter48, 32);
+  tmp0_l = p0_l_in - p1_l_in;
+  tmp0_l += q6_l_in;
+  tmp0_l -= p7_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  q6_h_in = (v8u16)__lsx_vexth_hu_bu(q6);
+  tmp0_h = p0_h_in - p1_h_in;
+  tmp0_h += q6_h_in;
+  tmp0_h -= p7_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+  __lsx_vst(filter8, dst, 16 * 6);
+
+  /* q0 */
+  q7_l_in = (v8u16)__lsx_vsllwil_hu_bu(q7, 0);
+  filter8 = __lsx_vld(filter48, 48);
+  tmp0_l = q7_l_in - p0_l_in;
+  tmp0_l += q0_l_in;
+  tmp0_l -= p7_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  q7_h_in = (v8u16)__lsx_vexth_hu_bu(q7);
+  tmp0_h = q7_h_in - p0_h_in;
+  tmp0_h += q0_h_in;
+  tmp0_h -= p7_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+  __lsx_vst(filter8, dst, 16 * 7);
+
+  /* q1 */
+  filter8 = __lsx_vld(filter48, 64);
+  tmp0_l = q7_l_in - q0_l_in;
+  tmp0_l += q1_l_in;
+  tmp0_l -= p6_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  tmp0_h = q7_h_in - q0_h_in;
+  tmp0_h += q1_h_in;
+  tmp0_h -= p6_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+  __lsx_vst(filter8, dst, 16 * 8);
+
+  /* q2 */
+  filter8 = __lsx_vld(filter48, 80);
+  tmp0_l = q7_l_in - q1_l_in;
+  tmp0_l += q2_l_in;
+  tmp0_l -= p5_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  tmp0_h = q7_h_in - q1_h_in;
+  tmp0_h += q2_h_in;
+  tmp0_h -= p5_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  filter8 = __lsx_vbitsel_v(filter8, out_l, flat2);
+  __lsx_vst(filter8, dst, 16 * 9);
+
+  /* q3 */
+  tmp0_l = q7_l_in - q2_l_in;
+  tmp0_l += q3_l_in;
+  tmp0_l -= p4_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  tmp0_h = q7_h_in - q2_h_in;
+  tmp0_h += q3_h_in;
+  tmp0_h -= p4_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  q3 = __lsx_vbitsel_v(q3, out_l, flat2);
+  __lsx_vst(q3, dst, 16 * 10);
+
+  /* q4 */
+  tmp0_l = q7_l_in - q3_l_in;
+  tmp0_l += q4_l_in;
+  tmp0_l -= p3_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  tmp0_h = q7_h_in - q3_h_in;
+  tmp0_h += q4_h_in;
+  tmp0_h -= p3_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  q4 = __lsx_vbitsel_v(q4, out_l, flat2);
+  __lsx_vst(q4, dst, 16 * 11);
+
+  /* q5 */
+  tmp0_l = q7_l_in - q4_l_in;
+  tmp0_l += q5_l_in;
+  tmp0_l -= p2_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  tmp0_h = q7_h_in - q4_h_in;
+  tmp0_h += q5_h_in;
+  tmp0_h -= p2_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  q5 = __lsx_vbitsel_v(q5, out_l, flat2);
+  __lsx_vst(q5, dst, 16 * 12);
+
+  /* q6 */
+  tmp0_l = q7_l_in - q5_l_in;
+  tmp0_l += q6_l_in;
+  tmp0_l -= p1_l_in;
+  tmp1_l += tmp0_l;
+  out_l = __lsx_vsrari_h((__m128i)tmp1_l, 4);
+  tmp0_h = q7_h_in - q5_h_in;
+  tmp0_h += q6_h_in;
+  tmp0_h -= p1_h_in;
+  tmp1_h += tmp0_h;
+  out_h = __lsx_vsrari_h((__m128i)tmp1_h, 4);
+  out_l = __lsx_vpickev_b(out_h, out_l);
+  q6 = __lsx_vbitsel_v(q6, out_l, flat2);
+  __lsx_vst(q6, dst, 16 * 13);
+
+  return 0;
+}
+
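+/* Vertical 16 dual: transpose the 16x16 pixels around the edge, filter them
+ * as horizontal rows, and transpose back only when the full filter16 path
+ * actually ran. */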
+void vpx_lpf_vertical_16_dual_lsx(uint8_t *src, int32_t pitch,
+                                  const uint8_t *b_limit_ptr,
+                                  const uint8_t *limit_ptr,
+                                  const uint8_t *thresh_ptr) {
+  uint8_t early_exit = 0;
+  DECLARE_ALIGNED(16, uint8_t, transposed_input[16 * 24]);
+  uint8_t *filter48 = &transposed_input[16 * 16];
+
+  transpose_16x16((src - 8), pitch, &transposed_input[0], 16);
+
+  early_exit =
+      vt_lpf_t4_and_t8_16w((transposed_input + 16 * 8), &filter48[0], src,
+                           pitch, b_limit_ptr, limit_ptr, thresh_ptr);
+
+  if (early_exit == 0) {
+    early_exit =
+        vt_lpf_t16_16w((transposed_input + 16 * 8), src, pitch, &filter48[0]);
+
+    if (early_exit == 0) {
+      transpose_16x16(transposed_input, 16, (src - 8), pitch);
+    }
+  }
+}
diff --git a/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c b/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c
new file mode 100644
index 0000000..9300b5c
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/loopfilter_4_lsx.c
@@ -0,0 +1,214 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/loopfilter_lsx.h"
+
+void vpx_lpf_horizontal_4_lsx(uint8_t *src, int32_t pitch,
+                              const uint8_t *b_limit_ptr,
+                              const uint8_t *limit_ptr,
+                              const uint8_t *thresh_ptr) {
+  __m128i mask, hev, flat, thresh, b_limit, limit;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0, p1_out, p0_out, q0_out, q1_out;
+  int32_t pitch2 = pitch << 1;
+  int32_t pitch3 = pitch2 + pitch;
+  int32_t pitch4 = pitch2 << 1;
+
+  DUP4_ARG2(__lsx_vldx, src, -pitch4, src, -pitch3, src, -pitch2, src, -pitch,
+            p3, p2, p1, p0);
+  q0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2);
+  q3 = __lsx_vldx(src, pitch3);
+
+  thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+  b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+  limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  __lsx_vstelm_d(p1_out, src - pitch2, 0, 0);
+  __lsx_vstelm_d(p0_out, src - pitch, 0, 0);
+  __lsx_vstelm_d(q0_out, src, 0, 0);
+  __lsx_vstelm_d(q1_out, src + pitch, 0, 0);
+}
+
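+/* Dual variant: the two parameter sets are replicated and interleaved so the
+ * low 64 bits carry set 0 and the high 64 bits carry set 1, filtering both
+ * 8-pixel groups in one pass. */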
+void vpx_lpf_horizontal_4_dual_lsx(uint8_t *src, int32_t pitch,
+                                   const uint8_t *b_limit0_ptr,
+                                   const uint8_t *limit0_ptr,
+                                   const uint8_t *thresh0_ptr,
+                                   const uint8_t *b_limit1_ptr,
+                                   const uint8_t *limit1_ptr,
+                                   const uint8_t *thresh1_ptr) {
+  __m128i mask, hev, flat, thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  int32_t pitch2 = pitch << 1;
+  int32_t pitch3 = pitch2 + pitch;
+  int32_t pitch4 = pitch2 << 1;
+
+  DUP4_ARG2(__lsx_vldx, src, -pitch4, src, -pitch3, src, -pitch2, src, -pitch,
+            p3, p2, p1, p0);
+  q0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, pitch, src, pitch2, q1, q2);
+  q3 = __lsx_vldx(src, pitch3);
+
+  thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0);
+  thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0);
+  thresh0 = __lsx_vilvl_d(thresh1, thresh0);
+
+  b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0);
+  b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0);
+  b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
+
+  limit0 = __lsx_vldrepl_b(limit0_ptr, 0);
+  limit1 = __lsx_vldrepl_b(limit1_ptr, 0);
+  limit0 = __lsx_vilvl_d(limit1, limit0);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+               mask, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+
+  __lsx_vstx(p1, src, -pitch2);
+  __lsx_vstx(p0, src, -pitch);
+  __lsx_vst(q0, src, 0);
+  __lsx_vstx(q1, src, pitch);
+}
+
+void vpx_lpf_vertical_4_lsx(uint8_t *src, int32_t pitch,
+                            const uint8_t *b_limit_ptr,
+                            const uint8_t *limit_ptr,
+                            const uint8_t *thresh_ptr) {
+  __m128i mask, hev, flat, limit, thresh, b_limit;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i vec0, vec1, vec2, vec3;
+  int32_t pitch2 = pitch << 1;
+  int32_t pitch3 = pitch2 + pitch;
+  int32_t pitch4 = pitch2 << 1;
+  uint8_t *src_tmp = src - 4;
+
+  p3 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, p2, p1);
+  p0 = __lsx_vldx(src_tmp, pitch3);
+  src_tmp += pitch4;
+  q0 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, q1, q2);
+  q3 = __lsx_vldx(src_tmp, pitch3);
+
+  thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+  b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+  limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+  LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
+                     q3);
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+  DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, vec0, vec1);
+  vec2 = __lsx_vilvl_h(vec1, vec0);
+  vec3 = __lsx_vilvh_h(vec1, vec0);
+
+  src -= 2;
+  __lsx_vstelm_w(vec2, src, 0, 0);
+  src += pitch;
+  __lsx_vstelm_w(vec2, src, 0, 1);
+  src += pitch;
+  __lsx_vstelm_w(vec2, src, 0, 2);
+  src += pitch;
+  __lsx_vstelm_w(vec2, src, 0, 3);
+  src += pitch;
+
+  __lsx_vstelm_w(vec3, src, 0, 0);
+  __lsx_vstelm_w(vec3, src + pitch, 0, 1);
+  __lsx_vstelm_w(vec3, src + pitch2, 0, 2);
+  __lsx_vstelm_w(vec3, src + pitch3, 0, 3);
+}
+
+void vpx_lpf_vertical_4_dual_lsx(uint8_t *src, int32_t pitch,
+                                 const uint8_t *b_limit0_ptr,
+                                 const uint8_t *limit0_ptr,
+                                 const uint8_t *thresh0_ptr,
+                                 const uint8_t *b_limit1_ptr,
+                                 const uint8_t *limit1_ptr,
+                                 const uint8_t *thresh1_ptr) {
+  __m128i mask, hev, flat;
+  __m128i thresh0, b_limit0, limit0, thresh1, b_limit1, limit1;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i row0, row1, row2, row3, row4, row5, row6, row7;
+  __m128i row8, row9, row10, row11, row12, row13, row14, row15;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  int32_t pitch2 = pitch << 1;
+  int32_t pitch3 = pitch2 + pitch;
+  int32_t pitch4 = pitch2 << 1;
+  uint8_t *src_tmp = src - 4;
+
+  row0 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row1, row2);
+  row3 = __lsx_vldx(src_tmp, pitch3);
+  src_tmp += pitch4;
+  row4 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row5, row6);
+  row7 = __lsx_vldx(src_tmp, pitch3);
+  src_tmp += pitch4;
+  row8 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row9, row10);
+  row11 = __lsx_vldx(src_tmp, pitch3);
+  src_tmp += pitch4;
+  row12 = __lsx_vld(src_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp, pitch, src_tmp, pitch2, row13, row14);
+  row15 = __lsx_vldx(src_tmp, pitch3);
+
+  LSX_TRANSPOSE16x8_B(row0, row1, row2, row3, row4, row5, row6, row7, row8,
+                      row9, row10, row11, row12, row13, row14, row15, p3, p2,
+                      p1, p0, q0, q1, q2, q3);
+
+  thresh0 = __lsx_vldrepl_b(thresh0_ptr, 0);
+  thresh1 = __lsx_vldrepl_b(thresh1_ptr, 0);
+  thresh0 = __lsx_vilvl_d(thresh1, thresh0);
+
+  b_limit0 = __lsx_vldrepl_b(b_limit0_ptr, 0);
+  b_limit1 = __lsx_vldrepl_b(b_limit1_ptr, 0);
+  b_limit0 = __lsx_vilvl_d(b_limit1, b_limit0);
+
+  limit0 = __lsx_vldrepl_b(limit0_ptr, 0);
+  limit1 = __lsx_vldrepl_b(limit1_ptr, 0);
+  limit0 = __lsx_vilvl_d(limit1, limit0);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit0, b_limit0, thresh0, hev,
+               mask, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1, p0, q0, q1);
+  DUP2_ARG2(__lsx_vilvl_b, p0, p1, q1, q0, tmp0, tmp1);
+  tmp2 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp3 = __lsx_vilvh_h(tmp1, tmp0);
+  DUP2_ARG2(__lsx_vilvh_b, p0, p1, q1, q0, tmp0, tmp1);
+  tmp4 = __lsx_vilvl_h(tmp1, tmp0);
+  tmp5 = __lsx_vilvh_h(tmp1, tmp0);
+
+  src -= 2;
+  __lsx_vstelm_w(tmp2, src, 0, 0);
+  __lsx_vstelm_w(tmp2, src + pitch, 0, 1);
+  __lsx_vstelm_w(tmp2, src + pitch2, 0, 2);
+  __lsx_vstelm_w(tmp2, src + pitch3, 0, 3);
+  src += pitch4;
+  __lsx_vstelm_w(tmp3, src, 0, 0);
+  __lsx_vstelm_w(tmp3, src + pitch, 0, 1);
+  __lsx_vstelm_w(tmp3, src + pitch2, 0, 2);
+  __lsx_vstelm_w(tmp3, src + pitch3, 0, 3);
+  src += pitch4;
+  __lsx_vstelm_w(tmp4, src, 0, 0);
+  __lsx_vstelm_w(tmp4, src + pitch, 0, 1);
+  __lsx_vstelm_w(tmp4, src + pitch2, 0, 2);
+  __lsx_vstelm_w(tmp4, src + pitch3, 0, 3);
+  src += pitch4;
+  __lsx_vstelm_w(tmp5, src, 0, 0);
+  __lsx_vstelm_w(tmp5, src + pitch, 0, 1);
+  __lsx_vstelm_w(tmp5, src + pitch2, 0, 2);
+  __lsx_vstelm_w(tmp5, src + pitch3, 0, 3);
+}
diff --git a/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c b/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c
new file mode 100644
index 0000000..00219ba
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/loopfilter_8_lsx.c
@@ -0,0 +1,458 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/loopfilter_lsx.h"
+
+void vpx_lpf_horizontal_8_lsx(uint8_t *dst, int32_t stride,
+                              const uint8_t *b_limit_ptr,
+                              const uint8_t *limit_ptr,
+                              const uint8_t *thresh_ptr) {
+  __m128i mask, hev, flat, thresh, b_limit, limit;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i p2_out, p1_out, p0_out, q0_out, q1_out;
+  __m128i p2_filter8, p1_filter8, p0_filter8;
+  __m128i q0_filter8, q1_filter8, q2_filter8;
+  __m128i p3_l, p2_l, p1_l, p0_l, q3_l, q2_l, q1_l, q0_l;
+
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  int32_t stride4 = stride2 << 1;
+
+  /* load vector elements */
+  DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
+            -stride, p3, p2, p1, p0);
+  q0 = __lsx_vld(dst, 0);
+  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+  q3 = __lsx_vldx(dst, stride3);
+
+  thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+  b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+  limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  flat = __lsx_vilvl_d(flat, flat);
+
+  if (__lsx_bz_v(flat)) {
+    __lsx_vstelm_d(p1_out, dst - stride2, 0, 0);
+    __lsx_vstelm_d(p0_out, dst - stride, 0, 0);
+    __lsx_vstelm_d(q0_out, dst, 0, 0);
+    __lsx_vstelm_d(q1_out, dst + stride, 0, 0);
+  } else {
+    DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
+              p0_l);
+    DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+              q3_l);
+    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filter8,
+                p1_filter8, p0_filter8, q0_filter8, q1_filter8, q2_filter8);
+
+    DUP2_ARG2(__lsx_vpickev_b, p1_filter8, p2_filter8, q0_filter8, p0_filter8,
+              p1_filter8, q0_filter8);
+    q2_filter8 = __lsx_vpickev_b(q2_filter8, q1_filter8);
+
+    p2 = __lsx_vilvl_d(p1_out, p2);
+    p0_out = __lsx_vilvl_d(q0_out, p0_out);
+    q1_out = __lsx_vilvl_d(q2, q1_out);
+
+    DUP2_ARG3(__lsx_vbitsel_v, p2, p1_filter8, flat, p0_out, q0_filter8, flat,
+              p2_out, p1_out);
+    p0_out = __lsx_vbitsel_v(q1_out, q2_filter8, flat);
+    dst -= stride3;
+
+    __lsx_vstelm_d(p2_out, dst, 0, 0);
+    __lsx_vstelm_d(p2_out, dst + stride, 0, 1);
+    __lsx_vstelm_d(p1_out, dst + stride2, 0, 0);
+    __lsx_vstelm_d(p1_out, dst + stride3, 0, 1);
+
+    dst += stride4;
+    __lsx_vstelm_d(p0_out, dst, 0, 0);
+    dst += stride;
+    __lsx_vstelm_d(p0_out, dst, 0, 1);
+  }
+}
+
+void vpx_lpf_horizontal_8_dual_lsx(
+    uint8_t *dst, int32_t stride, const uint8_t *b_limit0,
+    const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *b_limit1,
+    const uint8_t *limit1, const uint8_t *thresh1) {
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i p2_out, p1_out, p0_out, q0_out, q1_out, q2_out;
+  __m128i flat, mask, hev, thresh, b_limit, limit;
+  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
+  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
+  __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
+
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  int32_t stride4 = stride2 << 1;
+
+  DUP4_ARG2(__lsx_vldx, dst, -stride4, dst, -stride3, dst, -stride2, dst,
+            -stride, p3, p2, p1, p0);
+  q0 = __lsx_vld(dst, 0);
+  DUP2_ARG2(__lsx_vldx, dst, stride, dst, stride2, q1, q2);
+  q3 = __lsx_vldx(dst, stride3);
+
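+  /* Pack the two parameter sets into single vectors (set 0 in the low
+   * 64 bits, set 1 in the high) so both 8-pixel halves are filtered in
+   * one pass; p2_out is reused as a scratch register here. */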
+  thresh = __lsx_vldrepl_b(thresh0, 0);
+  p2_out = __lsx_vldrepl_b(thresh1, 0);
+  thresh = __lsx_vilvl_d(p2_out, thresh);
+
+  b_limit = __lsx_vldrepl_b(b_limit0, 0);
+  p2_out = __lsx_vldrepl_b(b_limit1, 0);
+  b_limit = __lsx_vilvl_d(p2_out, b_limit);
+
+  limit = __lsx_vldrepl_b(limit0, 0);
+  p2_out = __lsx_vldrepl_b(limit1, 0);
+  limit = __lsx_vilvl_d(p2_out, limit);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  if (__lsx_bz_v(flat)) {
+    __lsx_vst(p1_out, dst - stride2, 0);
+    __lsx_vst(p0_out, dst - stride, 0);
+    __lsx_vst(q0_out, dst, 0);
+    __lsx_vst(q1_out, dst + stride, 0);
+  } else {
+    DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
+              p0_l);
+    DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+              q3_l);
+    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+
+    DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h);
+    DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);
+    VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
+                p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
+
+    /* convert 16-bit output data into 8-bit */
+    DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l,
+              p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
+              p1_filt8_l, p0_filt8_l, q0_filt8_l);
+    DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
+              q1_filt8_l, q2_filt8_l);
+
+    /* store pixel values */
+    p2_out = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
+    p1_out = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
+    p0_out = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
+    q0_out = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
+    q1_out = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
+    q2_out = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
+
+    __lsx_vst(p2_out, dst - stride3, 0);
+    __lsx_vst(p1_out, dst - stride2, 0);
+    __lsx_vst(p0_out, dst - stride, 0);
+    __lsx_vst(q0_out, dst, 0);
+    __lsx_vst(q1_out, dst + stride, 0);
+    __lsx_vst(q2_out, dst + stride2, 0);
+  }
+}
+
+void vpx_lpf_vertical_8_lsx(uint8_t *dst, int32_t stride,
+                            const uint8_t *b_limit_ptr,
+                            const uint8_t *limit_ptr,
+                            const uint8_t *thresh_ptr) {
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i p1_out, p0_out, q0_out, q1_out;
+  __m128i flat, mask, hev, thresh, b_limit, limit;
+  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  __m128i zero = __lsx_vldi(0);
+
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  int32_t stride4 = stride2 << 1;
+  uint8_t *dst_tmp = dst - 4;
+
+  /* load vector elements */
+  p3 = __lsx_vld(dst_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p2, p1);
+  p0 = __lsx_vldx(dst_tmp, stride3);
+  dst_tmp += stride4;
+  q0 = __lsx_vld(dst_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q1, q2);
+  q3 = __lsx_vldx(dst_tmp, stride3);
+
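+  /* Transpose the 8x8 tile so the vertical edge becomes rows; the same
+   * horizontal filter math then applies. */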
+  LSX_TRANSPOSE8x8_B(p3, p2, p1, p0, q0, q1, q2, q3, p3, p2, p1, p0, q0, q1, q2,
+                     q3);
+
+  thresh = __lsx_vldrepl_b(thresh_ptr, 0);
+  b_limit = __lsx_vldrepl_b(b_limit_ptr, 0);
+  limit = __lsx_vldrepl_b(limit_ptr, 0);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  /* flat4 */
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  /* filter4 */
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+
+  flat = __lsx_vilvl_d(zero, flat);
+
+  /* If flat is zero for all pixels, there is no need to apply the wide filter. */
+  if (__lsx_bz_v(flat)) {
+    /* Store the 4 filtered pixels p1..q1 */
+    DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1);
+    p2 = __lsx_vilvl_h(p1, p0);
+    p3 = __lsx_vilvh_h(p1, p0);
+
+    dst -= 2;
+    __lsx_vstelm_w(p2, dst, 0, 0);
+    __lsx_vstelm_w(p2, dst + stride, 0, 1);
+    __lsx_vstelm_w(p2, dst + stride2, 0, 2);
+    __lsx_vstelm_w(p2, dst + stride3, 0, 3);
+    dst += stride4;
+    __lsx_vstelm_w(p3, dst, 0, 0);
+    __lsx_vstelm_w(p3, dst + stride, 0, 1);
+    __lsx_vstelm_w(p3, dst + stride2, 0, 2);
+    __lsx_vstelm_w(p3, dst + stride3, 0, 3);
+  } else {
+    DUP4_ARG2(__lsx_vilvl_b, zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l,
+              p1_l, p0_l);
+    DUP4_ARG2(__lsx_vilvl_b, zero, q0, zero, q1, zero, q2, zero, q3, q0_l, q1_l,
+              q2_l, q3_l);
+    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+    /* convert 16-bit output data into 8-bit */
+    DUP4_ARG2(__lsx_vpickev_b, p2_filt8_l, p2_filt8_l, p1_filt8_l, p1_filt8_l,
+              p0_filt8_l, p0_filt8_l, q0_filt8_l, q0_filt8_l, p2_filt8_l,
+              p1_filt8_l, p0_filt8_l, q0_filt8_l);
+    DUP2_ARG2(__lsx_vpickev_b, q1_filt8_l, q1_filt8_l, q2_filt8_l, q2_filt8_l,
+              q1_filt8_l, q2_filt8_l);
+    /* store pixel values */
+    p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
+    p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
+    p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
+    q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
+    q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
+    q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
+
+    /* Store the 6 filtered pixels p2..q2 */
+    DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3);
+    p1 = __lsx_vilvl_h(q3, p3);
+    p2 = __lsx_vilvh_h(q3, p3);
+    p3 = __lsx_vilvl_b(q2, q1);
+    dst -= 3;
+    __lsx_vstelm_w(p1, dst, 0, 0);
+    __lsx_vstelm_h(p3, dst, 4, 0);
+    dst += stride;
+    __lsx_vstelm_w(p1, dst, 0, 1);
+    __lsx_vstelm_h(p3, dst, 4, 1);
+    dst += stride;
+    __lsx_vstelm_w(p1, dst, 0, 2);
+    __lsx_vstelm_h(p3, dst, 4, 2);
+    dst += stride;
+    __lsx_vstelm_w(p1, dst, 0, 3);
+    __lsx_vstelm_h(p3, dst, 4, 3);
+    dst += stride;
+    __lsx_vstelm_w(p2, dst, 0, 0);
+    __lsx_vstelm_h(p3, dst, 4, 4);
+    dst += stride;
+    __lsx_vstelm_w(p2, dst, 0, 1);
+    __lsx_vstelm_h(p3, dst, 4, 5);
+    dst += stride;
+    __lsx_vstelm_w(p2, dst, 0, 2);
+    __lsx_vstelm_h(p3, dst, 4, 6);
+    dst += stride;
+    __lsx_vstelm_w(p2, dst, 0, 3);
+    __lsx_vstelm_h(p3, dst, 4, 7);
+  }
+}
+
+void vpx_lpf_vertical_8_dual_lsx(uint8_t *dst, int32_t stride,
+                                 const uint8_t *b_limit0, const uint8_t *limit0,
+                                 const uint8_t *thresh0,
+                                 const uint8_t *b_limit1, const uint8_t *limit1,
+                                 const uint8_t *thresh1) {
+  uint8_t *dst_tmp = dst - 4;
+  __m128i p3, p2, p1, p0, q3, q2, q1, q0;
+  __m128i p1_out, p0_out, q0_out, q1_out;
+  __m128i flat, mask, hev, thresh, b_limit, limit;
+  __m128i row4, row5, row6, row7, row12, row13, row14, row15;
+  __m128i p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l;
+  __m128i p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h;
+  __m128i p2_filt8_l, p1_filt8_l, p0_filt8_l;
+  __m128i q0_filt8_l, q1_filt8_l, q2_filt8_l;
+  __m128i p2_filt8_h, p1_filt8_h, p0_filt8_h;
+  __m128i q0_filt8_h, q1_filt8_h, q2_filt8_h;
+  int32_t stride2 = stride << 1;
+  int32_t stride3 = stride2 + stride;
+  int32_t stride4 = stride2 << 1;
+
+  p0 = __lsx_vld(dst_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, p1, p2);
+  p3 = __lsx_vldx(dst_tmp, stride3);
+  dst_tmp += stride4;
+  row4 = __lsx_vld(dst_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row5, row6);
+  row7 = __lsx_vldx(dst_tmp, stride3);
+  dst_tmp += stride4;
+
+  q3 = __lsx_vld(dst_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, q2, q1);
+  q0 = __lsx_vldx(dst_tmp, stride3);
+  dst_tmp += stride4;
+  row12 = __lsx_vld(dst_tmp, 0);
+  DUP2_ARG2(__lsx_vldx, dst_tmp, stride, dst_tmp, stride2, row13, row14);
+  row15 = __lsx_vldx(dst_tmp, stride3);
+
+  /* transpose 16x8 matrix into 8x16 */
+  LSX_TRANSPOSE16x8_B(p0, p1, p2, p3, row4, row5, row6, row7, q3, q2, q1, q0,
+                      row12, row13, row14, row15, p3, p2, p1, p0, q0, q1, q2,
+                      q3);
+
+  thresh = __lsx_vldrepl_b(thresh0, 0);
+  p1_out = __lsx_vldrepl_b(thresh1, 0);
+  thresh = __lsx_vilvl_d(p1_out, thresh);
+
+  b_limit = __lsx_vldrepl_b(b_limit0, 0);
+  p1_out = __lsx_vldrepl_b(b_limit1, 0);
+  b_limit = __lsx_vilvl_d(p1_out, b_limit);
+
+  limit = __lsx_vldrepl_b(limit0, 0);
+  p1_out = __lsx_vldrepl_b(limit1, 0);
+  limit = __lsx_vilvl_d(p1_out, limit);
+
+  /* mask and hev */
+  LPF_MASK_HEV(p3, p2, p1, p0, q0, q1, q2, q3, limit, b_limit, thresh, hev,
+               mask, flat);
+  /* flat4 */
+  VP9_FLAT4(p3, p2, p0, q0, q2, q3, flat);
+  /* filter4 */
+  VP9_LPF_FILTER4_4W(p1, p0, q0, q1, mask, hev, p1_out, p0_out, q0_out, q1_out);
+  /* If flat is zero for all pixels, there is no need to apply the wide filter. */
+  if (__lsx_bz_v(flat)) {
+    DUP2_ARG2(__lsx_vilvl_b, p0_out, p1_out, q1_out, q0_out, p0, p1);
+    p2 = __lsx_vilvl_h(p1, p0);
+    p3 = __lsx_vilvh_h(p1, p0);
+    DUP2_ARG2(__lsx_vilvh_b, p0_out, p1_out, q1_out, q0_out, p0, p1);
+    q2 = __lsx_vilvl_h(p1, p0);
+    q3 = __lsx_vilvh_h(p1, p0);
+    dst -= 2;
+    __lsx_vstelm_w(p2, dst, 0, 0);
+    __lsx_vstelm_w(p2, dst + stride, 0, 1);
+    __lsx_vstelm_w(p2, dst + stride2, 0, 2);
+    __lsx_vstelm_w(p2, dst + stride3, 0, 3);
+    dst += stride4;
+    __lsx_vstelm_w(p3, dst, 0, 0);
+    __lsx_vstelm_w(p3, dst + stride, 0, 1);
+    __lsx_vstelm_w(p3, dst + stride2, 0, 2);
+    __lsx_vstelm_w(p3, dst + stride3, 0, 3);
+    dst += stride4;
+    __lsx_vstelm_w(q2, dst, 0, 0);
+    __lsx_vstelm_w(q2, dst + stride, 0, 1);
+    __lsx_vstelm_w(q2, dst + stride2, 0, 2);
+    __lsx_vstelm_w(q2, dst + stride3, 0, 3);
+    dst += stride4;
+    __lsx_vstelm_w(q3, dst, 0, 0);
+    __lsx_vstelm_w(q3, dst + stride, 0, 1);
+    __lsx_vstelm_w(q3, dst + stride2, 0, 2);
+    __lsx_vstelm_w(q3, dst + stride3, 0, 3);
+  } else {
+    DUP4_ARG2(__lsx_vsllwil_hu_bu, p3, 0, p2, 0, p1, 0, p0, 0, p3_l, p2_l, p1_l,
+              p0_l);
+    DUP4_ARG2(__lsx_vsllwil_hu_bu, q0, 0, q1, 0, q2, 0, q3, 0, q0_l, q1_l, q2_l,
+              q3_l);
+    VP9_FILTER8(p3_l, p2_l, p1_l, p0_l, q0_l, q1_l, q2_l, q3_l, p2_filt8_l,
+                p1_filt8_l, p0_filt8_l, q0_filt8_l, q1_filt8_l, q2_filt8_l);
+    DUP4_ARG1(__lsx_vexth_hu_bu, p3, p2, p1, p0, p3_h, p2_h, p1_h, p0_h);
+    DUP4_ARG1(__lsx_vexth_hu_bu, q0, q1, q2, q3, q0_h, q1_h, q2_h, q3_h);
+
+    /* filter8 */
+    VP9_FILTER8(p3_h, p2_h, p1_h, p0_h, q0_h, q1_h, q2_h, q3_h, p2_filt8_h,
+                p1_filt8_h, p0_filt8_h, q0_filt8_h, q1_filt8_h, q2_filt8_h);
+
+    /* convert 16-bit output data into 8-bit */
+    DUP4_ARG2(__lsx_vpickev_b, p2_filt8_h, p2_filt8_l, p1_filt8_h, p1_filt8_l,
+              p0_filt8_h, p0_filt8_l, q0_filt8_h, q0_filt8_l, p2_filt8_l,
+              p1_filt8_l, p0_filt8_l, q0_filt8_l);
+    DUP2_ARG2(__lsx_vpickev_b, q1_filt8_h, q1_filt8_l, q2_filt8_h, q2_filt8_l,
+              q1_filt8_l, q2_filt8_l);
+
+    /* store pixel values */
+    p2 = __lsx_vbitsel_v(p2, p2_filt8_l, flat);
+    p1 = __lsx_vbitsel_v(p1_out, p1_filt8_l, flat);
+    p0 = __lsx_vbitsel_v(p0_out, p0_filt8_l, flat);
+    q0 = __lsx_vbitsel_v(q0_out, q0_filt8_l, flat);
+    q1 = __lsx_vbitsel_v(q1_out, q1_filt8_l, flat);
+    q2 = __lsx_vbitsel_v(q2, q2_filt8_l, flat);
+
+    DUP2_ARG2(__lsx_vilvl_b, p1, p2, q0, p0, p3, q3);
+    p2_filt8_l = __lsx_vilvl_h(q3, p3);
+    p2_filt8_h = __lsx_vilvh_h(q3, p3);
+    DUP2_ARG2(__lsx_vilvh_b, p1, p2, q0, p0, p3, q3);
+    p0_filt8_l = __lsx_vilvl_h(q3, p3);
+    p0_filt8_h = __lsx_vilvh_h(q3, p3);
+    q1_filt8_l = __lsx_vilvl_b(q2, q1);
+    q1_filt8_h = __lsx_vilvh_b(q2, q1);
+
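+    /* Write back in pixel order: each row stores 4 bytes (p2..q0) at the
+     * row start plus 2 bytes (q1, q2) at byte offset 4. */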
+    dst -= 3;
+    __lsx_vstelm_w(p2_filt8_l, dst, 0, 0);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 0);
+    dst += stride;
+    __lsx_vstelm_w(p2_filt8_l, dst, 0, 1);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 1);
+    dst += stride;
+    __lsx_vstelm_w(p2_filt8_l, dst, 0, 2);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 2);
+    dst += stride;
+    __lsx_vstelm_w(p2_filt8_l, dst, 0, 3);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 3);
+    dst += stride;
+    __lsx_vstelm_w(p2_filt8_h, dst, 0, 0);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 4);
+    dst += stride;
+    __lsx_vstelm_w(p2_filt8_h, dst, 0, 1);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 5);
+    dst += stride;
+    __lsx_vstelm_w(p2_filt8_h, dst, 0, 2);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 6);
+    dst += stride;
+    __lsx_vstelm_w(p2_filt8_h, dst, 0, 3);
+    __lsx_vstelm_h(q1_filt8_l, dst, 4, 7);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_l, dst, 0, 0);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 0);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_l, dst, 0, 1);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 1);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_l, dst, 0, 2);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 2);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_l, dst, 0, 3);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 3);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_h, dst, 0, 0);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 4);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_h, dst, 0, 1);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 5);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_h, dst, 0, 2);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 6);
+    dst += stride;
+    __lsx_vstelm_w(p0_filt8_h, dst, 0, 3);
+    __lsx_vstelm_h(q1_filt8_h, dst, 4, 7);
+  }
+}
diff --git a/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h b/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h
new file mode 100644
index 0000000..1c43836
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/loopfilter_lsx.h
@@ -0,0 +1,167 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_
+
+#include "vpx_util/loongson_intrinsics.h"
+
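+/*
+ * LPF_MASK_HEV computes, per byte lane:
+ *   hev_out  = 0xFF where max(|p1-p0|, |q1-q0|) > thresh
+ *   mask_out = 0xFF where |p0-q0|*2 + |p1-q1|/2 <= b_limit and every
+ *              adjacent-pixel delta is <= limit
+ *   flat_out = max(|p1-p0|, |q1-q0|), consumed later by VP9_FLAT4
+ */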
+#define LPF_MASK_HEV(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+                     limit_in, b_limit_in, thresh_in, hev_out, mask_out,     \
+                     flat_out)                                               \
+  do {                                                                       \
+    __m128i p3_asub_p2_m, p2_asub_p1_m, p1_asub_p0_m, q1_asub_q0_m;          \
+    __m128i p1_asub_q1_m, p0_asub_q0_m, q3_asub_q2_m, q2_asub_q1_m;          \
+                                                                             \
+    /* absolute subtraction of pixel values */                               \
+    p3_asub_p2_m = __lsx_vabsd_bu(p3_in, p2_in);                             \
+    p2_asub_p1_m = __lsx_vabsd_bu(p2_in, p1_in);                             \
+    p1_asub_p0_m = __lsx_vabsd_bu(p1_in, p0_in);                             \
+    q1_asub_q0_m = __lsx_vabsd_bu(q1_in, q0_in);                             \
+    q2_asub_q1_m = __lsx_vabsd_bu(q2_in, q1_in);                             \
+    q3_asub_q2_m = __lsx_vabsd_bu(q3_in, q2_in);                             \
+    p0_asub_q0_m = __lsx_vabsd_bu(p0_in, q0_in);                             \
+    p1_asub_q1_m = __lsx_vabsd_bu(p1_in, q1_in);                             \
+                                                                             \
+    /* calculation of hev */                                                 \
+    flat_out = __lsx_vmax_bu(p1_asub_p0_m, q1_asub_q0_m);                    \
+    hev_out = __lsx_vslt_bu(thresh_in, flat_out);                            \
+                                                                             \
+    /* calculation of mask */                                                \
+    p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p0_asub_q0_m);               \
+    p1_asub_q1_m = __lsx_vsrli_b(p1_asub_q1_m, 1);                           \
+    p0_asub_q0_m = __lsx_vsadd_bu(p0_asub_q0_m, p1_asub_q1_m);               \
+    mask_out = __lsx_vslt_bu(b_limit_in, p0_asub_q0_m);                      \
+    mask_out = __lsx_vmax_bu(flat_out, mask_out);                            \
+    p3_asub_p2_m = __lsx_vmax_bu(p3_asub_p2_m, p2_asub_p1_m);                \
+    mask_out = __lsx_vmax_bu(p3_asub_p2_m, mask_out);                        \
+    q2_asub_q1_m = __lsx_vmax_bu(q2_asub_q1_m, q3_asub_q2_m);                \
+    mask_out = __lsx_vmax_bu(q2_asub_q1_m, mask_out);                        \
+                                                                             \
+    mask_out = __lsx_vslt_bu(limit_in, mask_out);                            \
+    mask_out = __lsx_vxori_b(mask_out, 0xff);                                \
+  } while (0)
+
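+/* VP9_FLAT4 marks lanes where all of |p2-p0|, |q2-q0|, |p3-p0|, |q3-q0|
+ * (and the flat_out carried in from LPF_MASK_HEV) are <= 1. Note that it
+ * also reads the variable `mask` from the calling scope. */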
+#define VP9_FLAT4(p3_in, p2_in, p0_in, q0_in, q2_in, q3_in, flat_out)          \
+  do {                                                                         \
+    __m128i p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0;                    \
+    __m128i flat4_tmp = __lsx_vldi(1);                                         \
+                                                                               \
+    DUP4_ARG2(__lsx_vabsd_bu, p2_in, p0_in, q2_in, q0_in, p3_in, p0_in, q3_in, \
+              q0_in, p2_asub_p0, q2_asub_q0, p3_asub_p0, q3_asub_q0);          \
+    p2_asub_p0 = __lsx_vmax_bu(p2_asub_p0, q2_asub_q0);                        \
+    flat_out = __lsx_vmax_bu(p2_asub_p0, flat_out);                            \
+    p3_asub_p0 = __lsx_vmax_bu(p3_asub_p0, q3_asub_q0);                        \
+    flat_out = __lsx_vmax_bu(p3_asub_p0, flat_out);                            \
+                                                                               \
+    flat_out = __lsx_vslt_bu(flat4_tmp, flat_out);                             \
+    flat_out = __lsx_vxori_b(flat_out, 0xff);                                  \
+    flat_out = flat_out & (mask);                                              \
+  } while (0)
+
+#define VP9_FLAT5(p7_in, p6_in, p5_in, p4_in, p0_in, q0_in, q4_in, q5_in,      \
+                  q6_in, q7_in, flat_in, flat2_out)                            \
+  do {                                                                         \
+    __m128i flat5_tmp = __lsx_vldi(1);                                         \
+    __m128i p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0;                    \
+    __m128i p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0;                    \
+    DUP4_ARG2(__lsx_vabsd_bu, p4_in, p0_in, q4_in, q0_in, p5_in, p0_in, q5_in, \
+              q0_in, p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0);          \
+    DUP4_ARG2(__lsx_vabsd_bu, p6_in, p0_in, q6_in, q0_in, p7_in, p0_in, q7_in, \
+              q0_in, p6_asub_p0, q6_asub_q0, p7_asub_p0, q7_asub_q0);          \
+                                                                               \
+    DUP2_ARG2(__lsx_vmax_bu, p4_asub_p0, q4_asub_q0, p5_asub_p0, q5_asub_q0,   \
+              p4_asub_p0, flat2_out);                                          \
+    flat2_out = __lsx_vmax_bu(p4_asub_p0, flat2_out);                          \
+    p6_asub_p0 = __lsx_vmax_bu(p6_asub_p0, q6_asub_q0);                        \
+    flat2_out = __lsx_vmax_bu(p6_asub_p0, flat2_out);                          \
+    p7_asub_p0 = __lsx_vmax_bu(p7_asub_p0, q7_asub_q0);                        \
+    flat2_out = __lsx_vmax_bu(p7_asub_p0, flat2_out);                          \
+    flat2_out = __lsx_vslt_bu(flat5_tmp, flat2_out);                           \
+    flat2_out = __lsx_vxori_b(flat2_out, 0xff);                                \
+    flat2_out = flat2_out & flat_in;                                           \
+  } while (0)
+
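+/* VP9_LPF_FILTER4_4W is the narrow 4-tap filter. A sketch of the per-lane
+ * math, on values re-biased to signed by XOR 0x80:
+ *   filt = sat(sat(p1 - q1) & hev + 3 * (q0 - p0)) & mask
+ *   q0' = q0 - ((filt + 4) >> 3),  p0' = p0 + ((filt + 3) >> 3)
+ * p1/q1 are adjusted by (((filt + 4) >> 3) + 1) >> 1 only where hev is
+ * clear. */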
+#define VP9_LPF_FILTER4_4W(p1_in, p0_in, q0_in, q1_in, mask, hev, p1_out,  \
+                           p0_out, q0_out, q1_out)                         \
+  do {                                                                     \
+    __m128i p1_m, p0_m, q0_m, q1_m, filt, q0_sub_p0, t1, t2;               \
+    const __m128i cnst4b = __lsx_vldi(4);                                  \
+    const __m128i cnst3b = __lsx_vldi(3);                                  \
+    DUP4_ARG2(__lsx_vxori_b, p1_in, 0x80, p0_in, 0x80, q0_in, 0x80, q1_in, \
+              0x80, p1_m, p0_m, q0_m, q1_m);                               \
+    filt = __lsx_vssub_b(p1_m, q1_m);                                      \
+    filt &= hev;                                                           \
+                                                                           \
+    q0_sub_p0 = __lsx_vssub_b(q0_m, p0_m);                                 \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);                                 \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);                                 \
+    filt = __lsx_vsadd_b(filt, q0_sub_p0);                                 \
+    filt &= mask;                                                          \
+    DUP2_ARG2(__lsx_vsadd_b, filt, cnst4b, filt, cnst3b, t1, t2);          \
+    DUP2_ARG2(__lsx_vsrai_b, t1, 3, t2, 3, t1, t2);                        \
+                                                                           \
+    q0_m = __lsx_vssub_b(q0_m, t1);                                        \
+    p0_m = __lsx_vsadd_b(p0_m, t2);                                        \
+    DUP2_ARG2(__lsx_vxori_b, q0_m, 0x80, p0_m, 0x80, q0_out, p0_out);      \
+                                                                           \
+    filt = __lsx_vsrari_b(t1, 1);                                          \
+    hev = __lsx_vxori_b(hev, 0xff);                                        \
+    filt &= hev;                                                           \
+    q1_m = __lsx_vssub_b(q1_m, filt);                                      \
+    p1_m = __lsx_vsadd_b(p1_m, filt);                                      \
+    DUP2_ARG2(__lsx_vxori_b, q1_m, 0x80, p1_m, 0x80, q1_out, p1_out);      \
+  } while (0)
+
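+/* VP9_FILTER8 computes the wide (flat) filter on 16-bit lanes: each output
+ * is a rounded seven-tap sum shifted right by 3, e.g.
+ * p2' = (3*p3 + 2*p2 + p1 + p0 + q0 + 4) >> 3, matching the scalar
+ * filter8() in vpx_dsp/loopfilter.c. */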
+#define VP9_FILTER8(p3_in, p2_in, p1_in, p0_in, q0_in, q1_in, q2_in, q3_in, \
+                    p2_filt8_out, p1_filt8_out, p0_filt8_out, q0_filt8_out, \
+                    q1_filt8_out, q2_filt8_out)                             \
+  do {                                                                      \
+    __m128i tmp_filt8_0, tmp_filt8_1, tmp_filt8_2;                          \
+                                                                            \
+    tmp_filt8_2 = __lsx_vadd_h(p2_in, p1_in);                               \
+    tmp_filt8_2 = __lsx_vadd_h(tmp_filt8_2, p0_in);                         \
+    tmp_filt8_0 = __lsx_vslli_h(p3_in, 1);                                  \
+                                                                            \
+    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_2);                   \
+    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, q0_in);                         \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, p3_in);                         \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, p2_in);                         \
+    p2_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3);                          \
+                                                                            \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, p1_in);                         \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, q1_in);                         \
+    p1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3);                          \
+                                                                            \
+    tmp_filt8_1 = __lsx_vadd_h(q2_in, q1_in);                               \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, q0_in);                         \
+    tmp_filt8_2 = __lsx_vadd_h(tmp_filt8_2, tmp_filt8_1);                   \
+    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_2, p0_in);                         \
+    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, p3_in);                         \
+    p0_filt8_out = __lsx_vsrari_h(tmp_filt8_0, 3);                          \
+                                                                            \
+    tmp_filt8_0 = __lsx_vadd_h(q2_in, q3_in);                               \
+    tmp_filt8_0 = __lsx_vadd_h(p0_in, tmp_filt8_0);                         \
+    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1);                   \
+    tmp_filt8_1 = __lsx_vadd_h(q3_in, q3_in);                               \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_1, tmp_filt8_0);                   \
+    q2_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3);                          \
+                                                                            \
+    tmp_filt8_0 = __lsx_vadd_h(tmp_filt8_2, q3_in);                         \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, q0_in);                         \
+    q0_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3);                          \
+                                                                            \
+    tmp_filt8_1 = __lsx_vsub_h(tmp_filt8_0, p2_in);                         \
+    tmp_filt8_0 = __lsx_vadd_h(q1_in, q3_in);                               \
+    tmp_filt8_1 = __lsx_vadd_h(tmp_filt8_0, tmp_filt8_1);                   \
+    q1_filt8_out = __lsx_vsrari_h(tmp_filt8_1, 3);                          \
+  } while (0)
+
+#endif  // VPX_VPX_DSP_LOONGARCH_LOOPFILTER_LSX_H_
diff --git a/libvpx/vpx_dsp/loongarch/quantize_lsx.c b/libvpx/vpx_dsp/loongarch/quantize_lsx.c
new file mode 100644
index 0000000..2fc33b0
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/quantize_lsx.c
@@ -0,0 +1,253 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
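+// Per-lane scalar sketch of calculate_qcoeff (cf. vpx_quantize_b_c):
+//   tmp = sat(abs(coeff) + round);
+//   q   = ((((tmp * quant) >> 16) + tmp) * quant_shift) >> 16;
+//   out = copysign(q, coeff), zeroed where abs(coeff) < zbin (cmp_mask).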
+static INLINE __m128i calculate_qcoeff(__m128i coeff, __m128i coeff_abs,
+                                       __m128i round, __m128i quant,
+                                       __m128i shift, __m128i cmp_mask) {
+  __m128i rounded, qcoeff;
+
+  rounded = __lsx_vsadd_h(coeff_abs, round);
+  qcoeff = __lsx_vmuh_h(rounded, quant);
+  qcoeff = __lsx_vadd_h(rounded, qcoeff);
+  qcoeff = __lsx_vmuh_h(qcoeff, shift);
+  qcoeff = __lsx_vsigncov_h(coeff, qcoeff);
+  qcoeff = __lsx_vand_v(qcoeff, cmp_mask);
+
+  return qcoeff;
+}
+
+static INLINE void calculate_dqcoeff_and_store(__m128i qcoeff, __m128i dequant,
+                                               int16_t *dqcoeff) {
+  __m128i dqcoeff16 = __lsx_vmul_h(qcoeff, dequant);
+  __lsx_vst(dqcoeff16, dqcoeff, 0);
+}
+
+static INLINE void calculate_dqcoeff_and_store_32x32(__m128i qcoeff,
+                                                     __m128i dequant,
+                                                     int16_t *dqcoeff) {
+  // Compute on absolute values so the ">> 1" below rounds toward zero as in
+  // the C code; the sign is reapplied afterwards via __lsx_vsigncov_w.
+  __m128i low, high, dqcoeff32_0, dqcoeff32_1, res;
+  __m128i zero = __lsx_vldi(0);
+  __m128i coeff = __lsx_vabsd_h(qcoeff, zero);
+
+  const __m128i sign_0 = __lsx_vilvl_h(qcoeff, zero);
+  const __m128i sign_1 = __lsx_vilvh_h(qcoeff, zero);
+
+  low = __lsx_vmul_h(coeff, dequant);
+  high = __lsx_vmuh_h(coeff, dequant);
+  dqcoeff32_0 = __lsx_vilvl_h(high, low);
+  dqcoeff32_1 = __lsx_vilvh_h(high, low);
+
+  // "Divide" by 2.
+  dqcoeff32_0 = __lsx_vsrai_w(dqcoeff32_0, 1);
+  dqcoeff32_1 = __lsx_vsrai_w(dqcoeff32_1, 1);
+  dqcoeff32_0 = __lsx_vsigncov_w(sign_0, dqcoeff32_0);
+  dqcoeff32_1 = __lsx_vsigncov_w(sign_1, dqcoeff32_1);
+  res = __lsx_vpickev_h(dqcoeff32_1, dqcoeff32_0);
+  __lsx_vst(res, dqcoeff, 0);
+}
+
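+// scan_for_eob: per lane, yields (iscan[i] + 1) where the quantized
+// coefficient is non-zero and 0 elsewhere; the caller keeps a running max
+// to derive the end-of-block position.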
+static INLINE __m128i scan_for_eob(__m128i coeff0, __m128i coeff1,
+                                   __m128i zbin_mask0, __m128i zbin_mask1,
+                                   const int16_t *scan, int index,
+                                   __m128i zero) {
+  const __m128i zero_coeff0 = __lsx_vseq_h(coeff0, zero);
+  const __m128i zero_coeff1 = __lsx_vseq_h(coeff1, zero);
+  __m128i scan0 = __lsx_vld(scan + index, 0);
+  __m128i scan1 = __lsx_vld(scan + index + 8, 0);
+  __m128i eob0, eob1;
+
+  scan0 = __lsx_vsub_h(scan0, zbin_mask0);
+  scan1 = __lsx_vsub_h(scan1, zbin_mask1);
+  eob0 = __lsx_vandn_v(zero_coeff0, scan0);
+  eob1 = __lsx_vandn_v(zero_coeff1, scan1);
+  return __lsx_vmax_h(eob0, eob1);
+}
+
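+// Horizontal max-reduction: three shuffle+max steps fold the eight 16-bit
+// eob lanes down to a single scalar.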
+static INLINE int16_t accumulate_eob(__m128i eob) {
+  __m128i eob_shuffled;
+  int16_t res_m;
+
+  eob_shuffled = __lsx_vshuf4i_w(eob, 0xe);
+  eob = __lsx_vmax_h(eob, eob_shuffled);
+  eob_shuffled = __lsx_vshuf4i_h(eob, 0xe);
+  eob = __lsx_vmax_h(eob, eob_shuffled);
+  eob_shuffled = __lsx_vshuf4i_h(eob, 0x1);
+  eob = __lsx_vmax_h(eob, eob_shuffled);
+  res_m = __lsx_vpickve2gr_h(eob, 1);
+
+  return res_m;
+}
+
+#if !CONFIG_VP9_HIGHBITDEPTH
+void vpx_quantize_b_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
+                        const int16_t *zbin_ptr, const int16_t *round_ptr,
+                        const int16_t *quant_ptr,
+                        const int16_t *quant_shift_ptr, int16_t *qcoeff_ptr,
+                        int16_t *dqcoeff_ptr, const int16_t *dequant_ptr,
+                        uint16_t *eob_ptr, const int16_t *scan,
+                        const int16_t *iscan) {
+  __m128i zero = __lsx_vldi(0);
+  int index = 16;
+
+  __m128i zbin, round, quant, dequant, quant_shift;
+  __m128i coeff0, coeff1;
+  __m128i qcoeff0, qcoeff1;
+  __m128i cmp_mask0, cmp_mask1;
+  __m128i eob, eob0;
+
+  (void)scan;
+
+  zbin = __lsx_vld(zbin_ptr, 0);
+  round = __lsx_vld(round_ptr, 0);
+  quant = __lsx_vld(quant_ptr, 0);
+  dequant = __lsx_vld(dequant_ptr, 0);
+  quant_shift = __lsx_vld(quant_shift_ptr, 0);
+  // Handle one DC and first 15 AC.
+  DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1);
+  qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+  qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+  cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+  zbin = __lsx_vilvh_d(zbin, zbin);
+  cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+  qcoeff0 =
+      calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+  round = __lsx_vilvh_d(round, round);
+  quant = __lsx_vilvh_d(quant, quant);
+  quant_shift = __lsx_vilvh_d(quant_shift, quant_shift);
+  qcoeff1 =
+      calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+
+  __lsx_vst(qcoeff0, qcoeff_ptr, 0);
+  __lsx_vst(qcoeff1, qcoeff_ptr, 16);
+
+  calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr);
+  dequant = __lsx_vilvh_d(dequant, dequant);
+  calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + 8);
+
+  eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+  // AC-only loop.
+  while (index < n_coeffs) {
+    coeff0 = __lsx_vld(coeff_ptr + index, 0);
+    coeff1 = __lsx_vld(coeff_ptr + index + 8, 0);
+
+    qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+    qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+    cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+    cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+    qcoeff0 =
+        calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+    qcoeff1 =
+        calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+
+    __lsx_vst(qcoeff0, qcoeff_ptr + index, 0);
+    __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0);
+
+    calculate_dqcoeff_and_store(qcoeff0, dequant, dqcoeff_ptr + index);
+    calculate_dqcoeff_and_store(qcoeff1, dequant, dqcoeff_ptr + index + 8);
+
+    eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
+                        zero);
+    eob = __lsx_vmax_h(eob, eob0);
+
+    index += 16;
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
+
+void vpx_quantize_b_32x32_lsx(const int16_t *coeff_ptr, intptr_t n_coeffs,
+                              const int16_t *zbin_ptr, const int16_t *round_ptr,
+                              const int16_t *quant_ptr,
+                              const int16_t *quant_shift_ptr,
+                              int16_t *qcoeff_ptr, int16_t *dqcoeff_ptr,
+                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                              const int16_t *scan, const int16_t *iscan) {
+  __m128i zero = __lsx_vldi(0);
+  int index;
+
+  __m128i zbin, round, quant, dequant, quant_shift;
+  __m128i coeff0, coeff1, qcoeff0, qcoeff1, cmp_mask0, cmp_mask1;
+  __m128i eob = zero, eob0;
+
+  (void)scan;
+  (void)n_coeffs;
+
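+  // 32x32-specific scaling: zbin and round are halved with rounding and
+  // quant_shift is doubled, mirroring vpx_quantize_b_32x32_c; n_coeffs is
+  // ignored because a 32x32 block always has 1024 coefficients.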
+  zbin = __lsx_vld(zbin_ptr, 0);
+  zbin = __lsx_vsrari_h(zbin, 1);
+  round = __lsx_vld(round_ptr, 0);
+  round = __lsx_vsrari_h(round, 1);
+
+  quant = __lsx_vld(quant_ptr, 0);
+  dequant = __lsx_vld(dequant_ptr, 0);
+  quant_shift = __lsx_vld(quant_shift_ptr, 0);
+  quant_shift = __lsx_vslli_h(quant_shift, 1);
+  // Handle one DC and first 15 AC.
+  DUP2_ARG2(__lsx_vld, coeff_ptr, 0, coeff_ptr, 16, coeff0, coeff1);
+  qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+  qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+  cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+  // remove DC from zbin
+  zbin = __lsx_vilvh_d(zbin, zbin);
+  cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+  qcoeff0 =
+      calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+  // remove DC from round, quant, quant_shift
+  round = __lsx_vilvh_d(round, round);
+  quant = __lsx_vilvh_d(quant, quant);
+  quant_shift = __lsx_vilvh_d(quant_shift, quant_shift);
+  qcoeff1 =
+      calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+  __lsx_vst(qcoeff0, qcoeff_ptr, 0);
+  __lsx_vst(qcoeff1, qcoeff_ptr, 16);
+
+  calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr);
+  dequant = __lsx_vilvh_d(dequant, dequant);
+  calculate_dqcoeff_and_store_32x32(qcoeff1, dequant, dqcoeff_ptr + 8);
+  eob = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, 0, zero);
+  // AC-only loop.
+  for (index = 16; index < 32 * 32; index += 16) {
+    coeff0 = __lsx_vld(coeff_ptr + index, 0);
+    coeff1 = __lsx_vld(coeff_ptr + index + 8, 0);
+
+    qcoeff0 = __lsx_vabsd_h(coeff0, zero);
+    qcoeff1 = __lsx_vabsd_h(coeff1, zero);
+
+    cmp_mask0 = __lsx_vsle_h(zbin, qcoeff0);
+    cmp_mask1 = __lsx_vsle_h(zbin, qcoeff1);
+
+    qcoeff0 =
+        calculate_qcoeff(coeff0, qcoeff0, round, quant, quant_shift, cmp_mask0);
+    qcoeff1 =
+        calculate_qcoeff(coeff1, qcoeff1, round, quant, quant_shift, cmp_mask1);
+    __lsx_vst(qcoeff0, qcoeff_ptr + index, 0);
+    __lsx_vst(qcoeff1, qcoeff_ptr + index + 8, 0);
+
+    calculate_dqcoeff_and_store_32x32(qcoeff0, dequant, dqcoeff_ptr + index);
+    calculate_dqcoeff_and_store_32x32(qcoeff1, dequant,
+                                      dqcoeff_ptr + 8 + index);
+    eob0 = scan_for_eob(qcoeff0, qcoeff1, cmp_mask0, cmp_mask1, iscan, index,
+                        zero);
+    eob = __lsx_vmax_h(eob, eob0);
+  }
+
+  *eob_ptr = accumulate_eob(eob);
+}
+#endif
diff --git a/libvpx/vpx_dsp/loongarch/sad_lsx.c b/libvpx/vpx_dsp/loongarch/sad_lsx.c
new file mode 100644
index 0000000..b6fbedb
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/sad_lsx.c
@@ -0,0 +1,717 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
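+// sad_ub2_uh: |a - b| per byte lane, then pairwise widening add into 16-bit
+// lanes; two row-vectors are accumulated per call.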
+static INLINE __m128i sad_ub2_uh(__m128i in0, __m128i in1, __m128i ref0,
+                                 __m128i ref1) {
+  __m128i diff0_m, diff1_m, sad_m0;
+  __m128i sad_m = __lsx_vldi(0);
+
+  diff0_m = __lsx_vabsd_bu(in0, ref0);
+  diff1_m = __lsx_vabsd_bu(in1, ref1);
+
+  sad_m0 = __lsx_vhaddw_hu_bu(diff0_m, diff0_m);
+  sad_m = __lsx_vadd_h(sad_m, sad_m0);
+  sad_m0 = __lsx_vhaddw_hu_bu(diff1_m, diff1_m);
+  sad_m = __lsx_vadd_h(sad_m, sad_m0);
+
+  return sad_m;
+}
+
+static INLINE uint32_t hadd_uw_u32(__m128i in) {
+  __m128i res0_m;
+  uint32_t sum_m;
+
+  res0_m = __lsx_vhaddw_du_wu(in, in);
+  res0_m = __lsx_vhaddw_qu_du(res0_m, res0_m);
+  sum_m = __lsx_vpickve2gr_w(res0_m, 0);
+
+  return sum_m;
+}
+
+static INLINE uint32_t hadd_uh_u32(__m128i in) {
+  __m128i res_m;
+  uint32_t sum_m;
+
+  res_m = __lsx_vhaddw_wu_hu(in, in);
+  sum_m = hadd_uw_u32(res_m);
+
+  return sum_m;
+}
+
+static INLINE int32_t hadd_sw_s32(__m128i in) {
+  __m128i res0_m;
+  int32_t sum_m;
+
+  res0_m = __lsx_vhaddw_d_w(in, in);
+  res0_m = __lsx_vhaddw_q_d(res0_m, res0_m);
+  sum_m = __lsx_vpickve2gr_w(res0_m, 0);
+
+  return sum_m;
+}
+
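+// 8-wide SAD: four rows are loaded per iteration and packed two-per-vector
+// with __lsx_vpickev_d, so each 128-bit absolute-difference step covers two
+// rows.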
+static uint32_t sad_8width_lsx(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               int32_t height) {
+  int32_t ht_cnt;
+  uint32_t res;
+  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, sad_tmp;
+  __m128i sad = __lsx_vldi(0);
+
+  for (ht_cnt = (height >> 2); ht_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
+    src += src_stride;
+    ref += ref_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src1, ref1);
+    src += src_stride;
+    ref += ref_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src2, ref2);
+    src += src_stride;
+    ref += ref_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src3, ref3);
+    src += src_stride;
+    ref += ref_stride;
+    DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+              src0, src1, ref0, ref1);
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+  }
+  res = hadd_uh_u32(sad);
+  return res;
+}
+
+static uint32_t sad_16width_lsx(const uint8_t *src, int32_t src_stride,
+                                const uint8_t *ref, int32_t ref_stride,
+                                int32_t height) {
+  int32_t ht_cnt = (height >> 2);
+  uint32_t res;
+  __m128i src0, src1, ref0, ref1, sad_tmp;
+  __m128i sad = __lsx_vldi(0);
+  int32_t src_stride2 = src_stride << 1;
+  int32_t ref_stride2 = ref_stride << 1;
+
+  for (; ht_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1);
+    src += src_stride2;
+    ref += ref_stride2;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+
+    DUP2_ARG2(__lsx_vld, src, 0, ref, 0, src0, ref0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, ref, ref_stride, src1, ref1);
+    src += src_stride2;
+    ref += ref_stride2;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+  }
+
+  res = hadd_uh_u32(sad);
+  return res;
+}
+
+static uint32_t sad_32width_lsx(const uint8_t *src, int32_t src_stride,
+                                const uint8_t *ref, int32_t ref_stride,
+                                int32_t height) {
+  int32_t ht_cnt = (height >> 2);
+  uint32_t res;
+  __m128i src0, src1, ref0, ref1;
+  __m128i sad_tmp;
+  __m128i sad = __lsx_vldi(0);
+
+  for (; ht_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
+    ref += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
+    ref += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
+    ref += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, ref, 0, ref, 16, ref0, ref1);
+    ref += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+  }
+  res = hadd_uh_u32(sad);
+  return res;
+}
+
+static uint32_t sad_64width_lsx(const uint8_t *src, int32_t src_stride,
+                                const uint8_t *ref, int32_t ref_stride,
+                                int32_t height) {
+  int32_t ht_cnt = (height >> 1);
+  uint32_t sad = 0;
+  __m128i src0, src1, src2, src3;
+  __m128i ref0, ref1, ref2, ref3;
+  __m128i sad_tmp;
+  __m128i sad0 = __lsx_vldi(0);
+  __m128i sad1 = sad0;
+
+  for (; ht_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+              ref3);
+    ref += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+              ref3);
+    ref += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+  }
+
+  sad = hadd_uh_u32(sad0);
+  sad += hadd_uh_u32(sad1);
+
+  return sad;
+}
+
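+// x4d variants: one source block is compared against four reference blocks
+// (aref_ptr[0..3]) in a single pass, writing the four SADs to sad_array.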
+static void sad_8width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                               const uint8_t *const aref_ptr[],
+                               int32_t ref_stride, int32_t height,
+                               uint32_t *sad_array) {
+  int32_t ht_cnt = (height >> 2);
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  __m128i src0, src1, src2, src3, sad_tmp;
+  __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+  __m128i ref8, ref9, ref10, ref11, ref12, ref13, ref14, ref15;
+  __m128i sad0 = __lsx_vldi(0);
+  __m128i sad1 = sad0;
+  __m128i sad2 = sad0;
+  __m128i sad3 = sad0;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t ref_stride2 = ref_stride << 1;
+  int32_t ref_stride3 = ref_stride2 + ref_stride;
+  int32_t ref_stride4 = ref_stride2 << 1;
+
+  ref0_ptr = aref_ptr[0];
+  ref1_ptr = aref_ptr[1];
+  ref2_ptr = aref_ptr[2];
+  ref3_ptr = aref_ptr[3];
+
+  for (; ht_cnt--;) {
+    src0 = __lsx_vld(src_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, src_ptr, src_stride, src_ptr, src_stride2, src1,
+              src2);
+    src3 = __lsx_vldx(src_ptr, src_stride3);
+    src_ptr += src_stride4;
+    ref0 = __lsx_vld(ref0_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, ref0_ptr, ref_stride, ref0_ptr, ref_stride2, ref1,
+              ref2);
+    ref3 = __lsx_vldx(ref0_ptr, ref_stride3);
+    ref0_ptr += ref_stride4;
+    ref4 = __lsx_vld(ref1_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, ref1_ptr, ref_stride, ref1_ptr, ref_stride2, ref5,
+              ref6);
+    ref7 = __lsx_vldx(ref1_ptr, ref_stride3);
+    ref1_ptr += ref_stride4;
+    ref8 = __lsx_vld(ref2_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, ref2_ptr, ref_stride, ref2_ptr, ref_stride2, ref9,
+              ref10);
+    ref11 = __lsx_vldx(ref2_ptr, ref_stride3);
+    ref2_ptr += ref_stride4;
+    ref12 = __lsx_vld(ref3_ptr, 0);
+    DUP2_ARG2(__lsx_vldx, ref3_ptr, ref_stride, ref3_ptr, ref_stride2, ref13,
+              ref14);
+    ref15 = __lsx_vldx(ref3_ptr, ref_stride3);
+    ref3_ptr += ref_stride4;
+
+    DUP2_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, src0, src1);
+    DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+
+    DUP2_ARG2(__lsx_vpickev_d, ref5, ref4, ref7, ref6, ref0, ref1);
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+    DUP2_ARG2(__lsx_vpickev_d, ref9, ref8, ref11, ref10, ref0, ref1);
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad2 = __lsx_vadd_h(sad2, sad_tmp);
+
+    DUP2_ARG2(__lsx_vpickev_d, ref13, ref12, ref15, ref14, ref0, ref1);
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad3 = __lsx_vadd_h(sad3, sad_tmp);
+  }
+  sad_array[0] = hadd_uh_u32(sad0);
+  sad_array[1] = hadd_uh_u32(sad1);
+  sad_array[2] = hadd_uh_u32(sad2);
+  sad_array[3] = hadd_uh_u32(sad3);
+}
+
+static void sad_16width_x4d_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                                const uint8_t *const aref_ptr[],
+                                int32_t ref_stride, int32_t height,
+                                uint32_t *sad_array) {
+  int32_t ht_cnt = (height >> 1);
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  __m128i src, ref0, ref1, ref2, ref3, diff, sad_tmp;
+  __m128i sad0 = __lsx_vldi(0);
+  __m128i sad1 = sad0;
+  __m128i sad2 = sad0;
+  __m128i sad3 = sad0;
+
+  ref0_ptr = aref_ptr[0];
+  ref1_ptr = aref_ptr[1];
+  ref2_ptr = aref_ptr[2];
+  ref3_ptr = aref_ptr[3];
+
+  for (; ht_cnt--;) {
+    src = __lsx_vld(src_ptr, 0);
+    src_ptr += src_stride;
+    ref0 = __lsx_vld(ref0_ptr, 0);
+    ref0_ptr += ref_stride;
+    ref1 = __lsx_vld(ref1_ptr, 0);
+    ref1_ptr += ref_stride;
+    ref2 = __lsx_vld(ref2_ptr, 0);
+    ref2_ptr += ref_stride;
+    ref3 = __lsx_vld(ref3_ptr, 0);
+    ref3_ptr += ref_stride;
+
+    diff = __lsx_vabsd_bu(src, ref0);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    diff = __lsx_vabsd_bu(src, ref1);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+    diff = __lsx_vabsd_bu(src, ref2);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad2 = __lsx_vadd_h(sad2, sad_tmp);
+    diff = __lsx_vabsd_bu(src, ref3);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad3 = __lsx_vadd_h(sad3, sad_tmp);
+
+    src = __lsx_vld(src_ptr, 0);
+    src_ptr += src_stride;
+    ref0 = __lsx_vld(ref0_ptr, 0);
+    ref0_ptr += ref_stride;
+    ref1 = __lsx_vld(ref1_ptr, 0);
+    ref1_ptr += ref_stride;
+    ref2 = __lsx_vld(ref2_ptr, 0);
+    ref2_ptr += ref_stride;
+    ref3 = __lsx_vld(ref3_ptr, 0);
+    ref3_ptr += ref_stride;
+
+    diff = __lsx_vabsd_bu(src, ref0);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    diff = __lsx_vabsd_bu(src, ref1);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+    diff = __lsx_vabsd_bu(src, ref2);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad2 = __lsx_vadd_h(sad2, sad_tmp);
+    diff = __lsx_vabsd_bu(src, ref3);
+    sad_tmp = __lsx_vhaddw_hu_bu(diff, diff);
+    sad3 = __lsx_vadd_h(sad3, sad_tmp);
+  }
+  sad_array[0] = hadd_uh_u32(sad0);
+  sad_array[1] = hadd_uh_u32(sad1);
+  sad_array[2] = hadd_uh_u32(sad2);
+  sad_array[3] = hadd_uh_u32(sad3);
+}
+
+static void sad_32width_x4d_lsx(const uint8_t *src, int32_t src_stride,
+                                const uint8_t *const aref_ptr[],
+                                int32_t ref_stride, int32_t height,
+                                uint32_t *sad_array) {
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  int32_t ht_cnt = height;
+  __m128i src0, src1, ref0, ref1, sad_tmp;
+  __m128i sad0 = __lsx_vldi(0);
+  __m128i sad1 = sad0;
+  __m128i sad2 = sad0;
+  __m128i sad3 = sad0;
+
+  ref0_ptr = aref_ptr[0];
+  ref1_ptr = aref_ptr[1];
+  ref2_ptr = aref_ptr[2];
+  ref3_ptr = aref_ptr[3];
+
+  for (; ht_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src1);
+    src += src_stride;
+
+    DUP2_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0, ref1);
+    ref0_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+
+    DUP2_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref0, ref1);
+    ref1_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+    DUP2_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref0, ref1);
+    ref2_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad2 = __lsx_vadd_h(sad2, sad_tmp);
+
+    DUP2_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref0, ref1);
+    ref3_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad3 = __lsx_vadd_h(sad3, sad_tmp);
+  }
+  sad_array[0] = hadd_uh_u32(sad0);
+  sad_array[1] = hadd_uh_u32(sad1);
+  sad_array[2] = hadd_uh_u32(sad2);
+  sad_array[3] = hadd_uh_u32(sad3);
+}
+
+static void sad_64width_x4d_lsx(const uint8_t *src, int32_t src_stride,
+                                const uint8_t *const aref_ptr[],
+                                int32_t ref_stride, int32_t height,
+                                uint32_t *sad_array) {
+  const uint8_t *ref0_ptr, *ref1_ptr, *ref2_ptr, *ref3_ptr;
+  int32_t ht_cnt = height;
+  __m128i src0, src1, src2, src3;
+  __m128i ref0, ref1, ref2, ref3;
+  __m128i sad, sad_tmp;
+
+  __m128i sad0_0 = __lsx_vldi(0);
+  __m128i sad0_1 = sad0_0;
+  __m128i sad1_0 = sad0_0;
+  __m128i sad1_1 = sad0_0;
+  __m128i sad2_0 = sad0_0;
+  __m128i sad2_1 = sad0_0;
+  __m128i sad3_0 = sad0_0;
+  __m128i sad3_1 = sad0_0;
+
+  ref0_ptr = aref_ptr[0];
+  ref1_ptr = aref_ptr[1];
+  ref2_ptr = aref_ptr[2];
+  ref3_ptr = aref_ptr[3];
+
+  for (; ht_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+
+    DUP4_ARG2(__lsx_vld, ref0_ptr, 0, ref0_ptr, 16, ref0_ptr, 32, ref0_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref0_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad0_0 = __lsx_vadd_h(sad0_0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+    sad0_1 = __lsx_vadd_h(sad0_1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, ref1_ptr, 0, ref1_ptr, 16, ref1_ptr, 32, ref1_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref1_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad1_0 = __lsx_vadd_h(sad1_0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+    sad1_1 = __lsx_vadd_h(sad1_1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, ref2_ptr, 0, ref2_ptr, 16, ref2_ptr, 32, ref2_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref2_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad2_0 = __lsx_vadd_h(sad2_0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+    sad2_1 = __lsx_vadd_h(sad2_1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, ref3_ptr, 0, ref3_ptr, 16, ref3_ptr, 32, ref3_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref3_ptr += ref_stride;
+    sad_tmp = sad_ub2_uh(src0, src1, ref0, ref1);
+    sad3_0 = __lsx_vadd_h(sad3_0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, ref2, ref3);
+    sad3_1 = __lsx_vadd_h(sad3_1, sad_tmp);
+  }
+  sad = __lsx_vhaddw_wu_hu(sad0_0, sad0_0);
+  sad_tmp = __lsx_vhaddw_wu_hu(sad0_1, sad0_1);
+  sad = __lsx_vadd_w(sad, sad_tmp);
+  sad_array[0] = hadd_uw_u32(sad);
+
+  sad = __lsx_vhaddw_wu_hu(sad1_0, sad1_0);
+  sad_tmp = __lsx_vhaddw_wu_hu(sad1_1, sad1_1);
+  sad = __lsx_vadd_w(sad, sad_tmp);
+  sad_array[1] = hadd_uw_u32(sad);
+
+  sad = __lsx_vhaddw_wu_hu(sad2_0, sad2_0);
+  sad_tmp = __lsx_vhaddw_wu_hu(sad2_1, sad2_1);
+  sad = __lsx_vadd_w(sad, sad_tmp);
+  sad_array[2] = hadd_uw_u32(sad);
+
+  sad = __lsx_vhaddw_wu_hu(sad3_0, sad3_0);
+  sad_tmp = __lsx_vhaddw_wu_hu(sad3_1, sad3_1);
+  sad = __lsx_vadd_w(sad, sad_tmp);
+  sad_array[3] = hadd_uw_u32(sad);
+}
+
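+// avgsad variants: the reference is first rounded-averaged with a second
+// predictor (__lsx_vavgr_bu) and the SAD is then taken against the source,
+// matching the vpx_sadWxH_avg semantics.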
+static uint32_t avgsad_32width_lsx(const uint8_t *src, int32_t src_stride,
+                                   const uint8_t *ref, int32_t ref_stride,
+                                   int32_t height, const uint8_t *sec_pred) {
+  int32_t res, ht_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i ref0, ref1, ref2, ref3, ref4, ref5, ref6, ref7;
+  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  __m128i comp0, comp1, sad_tmp;
+  __m128i sad = __lsx_vldi(0);
+  const uint8_t *src_tmp, *ref_tmp;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t ref_stride2 = ref_stride << 1;
+  int32_t ref_stride3 = ref_stride2 + ref_stride;
+  int32_t ref_stride4 = ref_stride2 << 1;
+
+  for (; ht_cnt--;) {
+    src_tmp = src + 16;
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+    src6 = __lsx_vldx(src, src_stride3);
+    src1 = __lsx_vld(src_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src3,
+              src5);
+    src7 = __lsx_vldx(src_tmp, src_stride3);
+    src += src_stride4;
+
+    ref_tmp = ref + 16;
+    ref0 = __lsx_vld(ref, 0);
+    DUP2_ARG2(__lsx_vldx, ref, ref_stride, ref, ref_stride2, ref2, ref4);
+    ref6 = __lsx_vldx(ref, ref_stride3);
+    ref1 = __lsx_vld(ref_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, ref_tmp, ref_stride, ref_tmp, ref_stride2, ref3,
+              ref5);
+    ref7 = __lsx_vldx(ref_tmp, ref_stride3);
+    ref += ref_stride4;
+
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 32, sec_pred, 64, sec_pred, 96,
+              pred0, pred2, pred4, pred6);
+    DUP4_ARG2(__lsx_vld, sec_pred, 16, sec_pred, 48, sec_pred, 80, sec_pred,
+              112, pred1, pred3, pred5, pred7);
+    sec_pred += 128;
+
+    DUP2_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, comp0, comp1);
+    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+    DUP2_ARG2(__lsx_vavgr_bu, pred2, ref2, pred3, ref3, comp0, comp1);
+    sad_tmp = sad_ub2_uh(src2, src3, comp0, comp1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+    DUP2_ARG2(__lsx_vavgr_bu, pred4, ref4, pred5, ref5, comp0, comp1);
+    sad_tmp = sad_ub2_uh(src4, src5, comp0, comp1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+    DUP2_ARG2(__lsx_vavgr_bu, pred6, ref6, pred7, ref7, comp0, comp1);
+    sad_tmp = sad_ub2_uh(src6, src7, comp0, comp1);
+    sad = __lsx_vadd_h(sad, sad_tmp);
+  }
+  res = hadd_uh_u32(sad);
+  return res;
+}
+
+static uint32_t avgsad_64width_lsx(const uint8_t *src, int32_t src_stride,
+                                   const uint8_t *ref, int32_t ref_stride,
+                                   int32_t height, const uint8_t *sec_pred) {
+  int32_t res, ht_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+  __m128i comp0, comp1, comp2, comp3, pred0, pred1, pred2, pred3;
+  __m128i sad, sad_tmp;
+  __m128i sad0 = __lsx_vldi(0);
+  __m128i sad1 = sad0;
+
+  for (; ht_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+              ref3);
+    ref += ref_stride;
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+              pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+              ref3, comp0, comp1, comp2, comp3);
+    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+              ref3);
+    ref += ref_stride;
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+              pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+              ref3, comp0, comp1, comp2, comp3);
+    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+              ref3);
+    ref += ref_stride;
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+              pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+              ref3, comp0, comp1, comp2, comp3);
+    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, ref, 0, ref, 16, ref, 32, ref, 48, ref0, ref1, ref2,
+              ref3);
+    ref += ref_stride;
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+              pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    DUP4_ARG2(__lsx_vavgr_bu, pred0, ref0, pred1, ref1, pred2, ref2, pred3,
+              ref3, comp0, comp1, comp2, comp3);
+    sad_tmp = sad_ub2_uh(src0, src1, comp0, comp1);
+    sad0 = __lsx_vadd_h(sad0, sad_tmp);
+    sad_tmp = sad_ub2_uh(src2, src3, comp2, comp3);
+    sad1 = __lsx_vadd_h(sad1, sad_tmp);
+  }
+  sad = __lsx_vhaddw_wu_hu(sad0, sad0);
+  sad_tmp = __lsx_vhaddw_wu_hu(sad1, sad1);
+  sad = __lsx_vadd_w(sad, sad_tmp);
+
+  res = hadd_sw_s32(sad);
+  return res;
+}
+
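+/* The macros below stamp out the thin vpx_sad<W>x<H>_lsx wrappers that the
+ * RTCD dispatch tables expect, forwarding to the width-specialized helpers
+ * above with the block height baked in as a constant.
+ */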
+#define VPX_SAD_8xHT_LSX(height)                                             \
+  uint32_t vpx_sad8x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
+                                   const uint8_t *ref, int32_t ref_stride) { \
+    return sad_8width_lsx(src, src_stride, ref, ref_stride, height);         \
+  }
+
+#define VPX_SAD_16xHT_LSX(height)                                             \
+  uint32_t vpx_sad16x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
+                                    const uint8_t *ref, int32_t ref_stride) { \
+    return sad_16width_lsx(src, src_stride, ref, ref_stride, height);         \
+  }
+
+#define VPX_SAD_32xHT_LSX(height)                                             \
+  uint32_t vpx_sad32x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
+                                    const uint8_t *ref, int32_t ref_stride) { \
+    return sad_32width_lsx(src, src_stride, ref, ref_stride, height);         \
+  }
+
+#define VPX_SAD_64xHT_LSX(height)                                             \
+  uint32_t vpx_sad64x##height##_lsx(const uint8_t *src, int32_t src_stride,   \
+                                    const uint8_t *ref, int32_t ref_stride) { \
+    return sad_64width_lsx(src, src_stride, ref, ref_stride, height);         \
+  }
+
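+/* x4d wrappers: one call computes the SADs of the same source block against
+ * four candidate reference blocks (refs[0..3]) in a single pass, which is
+ * how the encoder's motion search batches its comparisons.
+ */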
+#define VPX_SAD_8xHTx4D_LSX(height)                                       \
+  void vpx_sad8x##height##x4d_lsx(const uint8_t *src, int32_t src_stride, \
+                                  const uint8_t *const refs[4],           \
+                                  int32_t ref_stride, uint32_t sads[4]) { \
+    sad_8width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);  \
+  }
+
+#define VPX_SAD_16xHTx4D_LSX(height)                                        \
+  void vpx_sad16x##height##x4d_lsx(const uint8_t *src, int32_t src_stride,  \
+                                   const uint8_t *const refs[4],            \
+                                   int32_t ref_stride, uint32_t sads[4]) {  \
+    sad_16width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);   \
+  }
+
+#define VPX_SAD_32xHTx4D_LSX(height)                                        \
+  void vpx_sad32x##height##x4d_lsx(const uint8_t *src, int32_t src_stride,  \
+                                   const uint8_t *const refs[4],            \
+                                   int32_t ref_stride, uint32_t sads[4]) {  \
+    sad_32width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);   \
+  }
+
+#define VPX_SAD_64xHTx4D_LSX(height)                                        \
+  void vpx_sad64x##height##x4d_lsx(const uint8_t *src, int32_t src_stride,  \
+                                   const uint8_t *const refs[4],            \
+                                   int32_t ref_stride, uint32_t sads[4]) {  \
+    sad_64width_x4d_lsx(src, src_stride, refs, ref_stride, height, sads);   \
+  }
+
+#define VPX_AVGSAD_32xHT_LSX(height)                                    \
+  uint32_t vpx_sad32x##height##_avg_lsx(                                \
+      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
+      int32_t ref_stride, const uint8_t *second_pred) {                 \
+    return avgsad_32width_lsx(src, src_stride, ref, ref_stride, height, \
+                              second_pred);                             \
+  }
+
+#define VPX_AVGSAD_64xHT_LSX(height)                                    \
+  uint32_t vpx_sad64x##height##_avg_lsx(                                \
+      const uint8_t *src, int32_t src_stride, const uint8_t *ref,       \
+      int32_t ref_stride, const uint8_t *second_pred) {                 \
+    return avgsad_64width_lsx(src, src_stride, ref, ref_stride, height, \
+                              second_pred);                             \
+  }
+
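+/* Instantiate only the width/height combinations that the LSX build wires up
+ * in vpx_dsp_rtcd: e.g. SAD64 provides the 64x64 SAD, the 64x64 and 64x32
+ * x4d variants, and the 64x64 averaging SAD.
+ */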
+#define SAD64                                                             \
+  VPX_SAD_64xHT_LSX(64) VPX_SAD_64xHTx4D_LSX(64) VPX_SAD_64xHTx4D_LSX(32) \
+      VPX_AVGSAD_64xHT_LSX(64)
+
+SAD64
+
+#define SAD32                                                             \
+  VPX_SAD_32xHT_LSX(32) VPX_SAD_32xHTx4D_LSX(32) VPX_SAD_32xHTx4D_LSX(64) \
+      VPX_AVGSAD_32xHT_LSX(32)
+
+SAD32
+
+#define SAD16 VPX_SAD_16xHT_LSX(16) VPX_SAD_16xHTx4D_LSX(16)
+
+SAD16
+
+#define SAD8 VPX_SAD_8xHT_LSX(8) VPX_SAD_8xHTx4D_LSX(8)
+
+SAD8
+
+#undef SAD64
+#undef SAD32
+#undef SAD16
+#undef SAD8
diff --git a/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c b/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c
new file mode 100644
index 0000000..7007935
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/sub_pixel_variance_lsx.c
@@ -0,0 +1,874 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_ports/mem.h"
+#include "vpx_dsp/loongarch/variance_lsx.h"
+#include "vpx_dsp/variance.h"
+
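+/* Two-tap bilinear filter weights for the 8 subpel phases; each pair sums to
+ * 128 (1 << FILTER_BITS), so phase k uses (128 - 16k, 16k).
+ */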
+static const uint8_t bilinear_filters_lsx[8][2] = {
+  { 128, 0 }, { 112, 16 }, { 96, 32 }, { 80, 48 },
+  { 64, 64 }, { 48, 80 },  { 32, 96 }, { 16, 112 },
+};
+
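+/* variance = sse - sum^2 / (w * h), with shift == log2(w * h). The _LARGE
+ * variant widens sum^2 to 64 bits, since the squared sum of differences for
+ * a 32x32 or 64x64 block can overflow a 32-bit product.
+ */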
+#define VARIANCE_WxH(sse, diff, shift) \
+  (sse) - (((uint32_t)(diff) * (diff)) >> (shift))
+
+#define VARIANCE_LARGE_WxH(sse, diff, shift) \
+  (sse) - (((int64_t)(diff) * (diff)) >> (shift))
+
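+/* SSE and sum-of-differences for a 64x64 block whose prediction is the
+ * rounded average of src and sec_pred; this feeds the x_offset == 0 &&
+ * y_offset == 0 path of the 64x64 sub-pixel averaging variance below.
+ */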
+static uint32_t avg_sse_diff_64x64_lsx(const uint8_t *src_ptr,
+                                       int32_t src_stride,
+                                       const uint8_t *ref_ptr,
+                                       int32_t ref_stride,
+                                       const uint8_t *sec_pred, int32_t *diff) {
+  int32_t res, ht_cnt = 32;
+  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+  __m128i pred0, pred1, pred2, pred3, vec, vec_tmp;
+  __m128i avg0, avg1, avg2, avg3;
+  __m128i var = __lsx_vldi(0);
+
+  avg0 = var;
+  avg1 = var;
+  avg2 = var;
+  avg3 = var;
+
+  for (; ht_cnt--;) {
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+              pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+              src0, src1, src2, src3);
+    src_ptr += src_stride;
+    DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+
+    DUP4_ARG2(__lsx_vavgr_bu, src0, pred0, src1, pred1, src2, pred2, src3,
+              pred3, src0, src1, src2, src3);
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src2, ref2, var, avg2);
+    CALC_MSE_AVG_B(src3, ref3, var, avg3);
+
+    DUP4_ARG2(__lsx_vld, sec_pred, 0, sec_pred, 16, sec_pred, 32, sec_pred, 48,
+              pred0, pred1, pred2, pred3);
+    sec_pred += 64;
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+              src0, src1, src2, src3);
+    src_ptr += src_stride;
+    DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+
+    DUP4_ARG2(__lsx_vavgr_bu, src0, pred0, src1, pred1, src2, pred2, src3,
+              pred3, src0, src1, src2, src3);
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src2, ref2, var, avg2);
+    CALC_MSE_AVG_B(src3, ref3, var, avg3);
+  }
+  vec = __lsx_vhaddw_w_h(avg0, avg0);
+  vec_tmp = __lsx_vhaddw_w_h(avg1, avg1);
+  vec = __lsx_vadd_w(vec, vec_tmp);
+  vec_tmp = __lsx_vhaddw_w_h(avg2, avg2);
+  vec = __lsx_vadd_w(vec, vec_tmp);
+  vec_tmp = __lsx_vhaddw_w_h(avg3, avg3);
+  vec = __lsx_vadd_w(vec, vec_tmp);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+
+  return res;
+}
+
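+/* Horizontal-only sub-pixel SSE/diff for 8-wide blocks. The shuffle mask
+ * pairs each pixel with its right neighbour so a single dot product applies
+ * the 2-tap filter; per pixel this is roughly
+ *   out[i] = (src[i] * f[0] + src[i + 1] * f[1] + 64) >> FILTER_BITS
+ * (a rounding sketch, assuming FILTER_BITS == 7).
+ */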
+static uint32_t sub_pixel_sse_diff_8width_h_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3;
+  __m128i vec0, vec1, vec2, vec3, filt0, out, vec;
+  __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  for (; loop_cnt--;) {
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    ref0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
+    ref3 = __lsx_vldx(dst, dst_stride3);
+    dst += dst_stride4;
+
+    DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, vec0, vec1, vec2, vec3);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, vec0, vec0, FILTER_BITS, vec1, vec1,
+              FILTER_BITS, vec2, vec2, FILTER_BITS, vec3, vec3, FILTER_BITS,
+              src0, src1, src2, src3);
+    out = __lsx_vpackev_d(src1, src0);
+    CALC_MSE_AVG_B(out, ref0, var, avg);
+    out = __lsx_vpackev_d(src3, src2);
+    CALC_MSE_AVG_B(out, ref1, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sub_pixel_sse_diff_16width_h_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i dst0, dst1, dst2, dst3, filt0;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i vec, var = __lsx_vldi(0);
+  __m128i avg = var;
+  __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
+    src += src_stride;
+
+    dst0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+    dst3 = __lsx_vldx(dst, dst_stride3);
+    dst += dst_stride4;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, out0, out1, out2, out3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, out4, out5, out6, out7);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+              src0, src1, src2, src3);
+    CALC_MSE_AVG_B(src0, dst0, var, avg);
+    CALC_MSE_AVG_B(src1, dst1, var, avg);
+    CALC_MSE_AVG_B(src2, dst2, var, avg);
+    CALC_MSE_AVG_B(src3, dst3, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sub_pixel_sse_diff_32width_h_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+  uint32_t sse = 0;
+  int32_t diff0[2];
+
+  sse += sub_pixel_sse_diff_16width_h_lsx(src, src_stride, dst, dst_stride,
+                                          filter, height, &diff0[0]);
+  src += 16;
+  dst += 16;
+
+  sse += sub_pixel_sse_diff_16width_h_lsx(src, src_stride, dst, dst_stride,
+                                          filter, height, &diff0[1]);
+
+  *diff = diff0[0] + diff0[1];
+
+  return sse;
+}
+
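+/* Vertical-only variant: adjacent rows are interleaved byte-wise so the same
+ * 2-tap dot product filters between row n and row n + 1; src4 is carried
+ * over as the new top row for the next iteration.
+ */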
+static uint32_t sub_pixel_sse_diff_8width_v_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4;
+  __m128i vec, vec0, vec1, vec2, vec3, tmp0, tmp1, tmp2, tmp3, filt0;
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    ref0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
+    ref3 = __lsx_vldx(dst, dst_stride3);
+    dst += dst_stride4;
+
+    DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+              vec0, vec1, vec2, vec3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, src0, src1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    src0 = src4;
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sub_pixel_sse_diff_16width_v_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4;
+  __m128i out0, out1, out2, out3, tmp0, tmp1, filt0, vec;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i var = __lsx_vldi(0);
+  __m128i avg = var;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    ref0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
+    ref3 = __lsx_vldx(dst, dst_stride3);
+    dst += dst_stride4;
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    src0 = src4;
+
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+    CALC_MSE_AVG_B(out2, ref2, var, avg);
+    CALC_MSE_AVG_B(out3, ref3, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sub_pixel_sse_diff_32width_v_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter, int32_t height, int32_t *diff) {
+  uint32_t sse = 0;
+  int32_t diff0[2];
+
+  sse += sub_pixel_sse_diff_16width_v_lsx(src, src_stride, dst, dst_stride,
+                                          filter, height, &diff0[0]);
+  src += 16;
+  dst += 16;
+
+  sse += sub_pixel_sse_diff_16width_v_lsx(src, src_stride, dst, dst_stride,
+                                          filter, height, &diff0[1]);
+
+  *diff = diff0[0] + diff0[1];
+
+  return sse;
+}
+
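+/* Combined horizontal + vertical filtering: each source row is first
+ * filtered horizontally (HORIZ_2TAP_FILT_UH), then consecutive horizontal
+ * outputs are filtered vertically; the hz_out registers ping-pong so only
+ * one extra row of intermediate data is kept across iterations.
+ */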
+static uint32_t sub_pixel_sse_diff_8width_hv_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+    int32_t height, int32_t *diff) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i ref0, ref1, ref2, ref3, src0, src1, src2, src3, src4, out0, out1;
+  __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3, vec, vec0, filt_hz, filt_vt;
+  __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+  HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src1, ref0);
+    src += src_stride;
+    dst += dst_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src2, ref1);
+    src += src_stride;
+    dst += dst_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src3, ref2);
+    src += src_stride;
+    dst += dst_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, dst, 0, src4, ref3);
+    src += src_stride;
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vpickev_d, ref1, ref0, ref3, ref2, ref0, ref1);
+    HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out1);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
+    HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0);
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out1);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
+    HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out0);
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, out0, out1);
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sub_pixel_sse_diff_16width_hv_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+    int32_t height, int32_t *diff) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i ref0, ref1, ref2, ref3, filt_hz, filt_vt, vec0, vec1;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, vec;
+  __m128i var = __lsx_vldi(0);
+  __m128i avg = var;
+  __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+  src += src_stride;
+
+  HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0);
+  HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out2);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
+    src += src_stride;
+
+    ref0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, ref1, ref2);
+    ref3 = __lsx_vldx(dst, dst_stride3);
+    dst += dst_stride4;
+
+    HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out1);
+    HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out3);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    src0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0);
+    HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out2);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    src1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out1);
+    HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS, hz_out3);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    src2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS, hz_out0);
+    HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS, hz_out2);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    src3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+    CALC_MSE_AVG_B(src2, ref2, var, avg);
+    CALC_MSE_AVG_B(src3, ref3, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+
+  return res;
+}
+
+static uint32_t sub_pixel_sse_diff_32width_hv_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *filter_horiz, const uint8_t *filter_vert,
+    int32_t height, int32_t *diff) {
+  uint32_t sse = 0;
+  int32_t diff0[2];
+
+  sse += sub_pixel_sse_diff_16width_hv_lsx(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert, height,
+                                           &diff0[0]);
+  src += 16;
+  dst += 16;
+
+  sse += sub_pixel_sse_diff_16width_hv_lsx(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert, height,
+                                           &diff0[1]);
+
+  *diff = diff0[0] + diff0[1];
+
+  return sse;
+}
+
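+/* Like the sub-pixel SSE/diff helpers above, but the filtered result is
+ * averaged with sec_pred before the MSE accumulation; `width` is the row
+ * stride of the second predictor, so the same 16-wide kernel can walk one
+ * column of a wider (e.g. 64-wide) prediction block.
+ */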
+static uint32_t subpel_avg_ssediff_16w_h_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+    int32_t height, int32_t *diff, int32_t width) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i dst0, dst1, dst2, dst3, tmp0, tmp1, tmp2, tmp3;
+  __m128i pred0, pred1, pred2, pred3, filt0, vec;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
+    src += src_stride;
+
+    dst0 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    dst1 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    dst2 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    dst3 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+
+    pred0 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred1 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred2 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred3 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, out0, out1, out2, out3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, out4, out5, out6, out7);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vavgr_bu, tmp0, pred0, tmp1, pred1, tmp2, pred2, tmp3,
+              pred3, tmp0, tmp1, tmp2, tmp3);
+
+    CALC_MSE_AVG_B(tmp0, dst0, var, avg);
+    CALC_MSE_AVG_B(tmp1, dst1, var, avg);
+    CALC_MSE_AVG_B(tmp2, dst2, var, avg);
+    CALC_MSE_AVG_B(tmp3, dst3, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+
+  return res;
+}
+
+static uint32_t subpel_avg_ssediff_16w_v_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+    int32_t height, int32_t *diff, int32_t width) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3;
+  __m128i src0, src1, src2, src3, src4, out0, out1, out2, out3;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i tmp0, tmp1, vec, filt0;
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    src += src_stride;
+    src2 = __lsx_vld(src, 0);
+    src += src_stride;
+    src3 = __lsx_vld(src, 0);
+    src += src_stride;
+    src4 = __lsx_vld(src, 0);
+    src += src_stride;
+
+    pred0 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred1 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred2 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred3 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    src0 = src4;
+    ref0 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    ref1 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    ref2 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    ref3 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+
+    DUP4_ARG2(__lsx_vavgr_bu, out0, pred0, out1, pred1, out2, pred2, out3,
+              pred3, out0, out1, out2, out3);
+
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+    CALC_MSE_AVG_B(out2, ref2, var, avg);
+    CALC_MSE_AVG_B(out3, ref3, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t subpel_avg_ssediff_16w_hv_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+    const uint8_t *filter_vert, int32_t height, int32_t *diff, int32_t width) {
+  uint32_t loop_cnt = (height >> 2);
+  int32_t res;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i ref0, ref1, ref2, ref3, pred0, pred1, pred2, pred3;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1;
+  __m128i out0, out1, out2, out3, filt_hz, filt_vt, vec, vec0, vec1;
+  __m128i mask = { 0x0403030202010100, 0x0807070606050504 };
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+  src += src_stride;
+
+  HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out0);
+  HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out2);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src4, src5);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src6, src7);
+    src += src_stride;
+
+    pred0 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred1 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred2 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+    pred3 = __lsx_vld(sec_pred, 0);
+    sec_pred += width;
+
+    HORIZ_2TAP_FILT_UH(src0, src0, mask, filt_hz, FILTER_BITS, hz_out1);
+    HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, FILTER_BITS, hz_out3);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    out0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    HORIZ_2TAP_FILT_UH(src2, src2, mask, filt_hz, FILTER_BITS, hz_out0);
+    HORIZ_2TAP_FILT_UH(src3, src3, mask, filt_hz, FILTER_BITS, hz_out2);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    out1 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    HORIZ_2TAP_FILT_UH(src4, src4, mask, filt_hz, FILTER_BITS, hz_out1);
+    HORIZ_2TAP_FILT_UH(src5, src5, mask, filt_hz, FILTER_BITS, hz_out3);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    out2 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    HORIZ_2TAP_FILT_UH(src6, src6, mask, filt_hz, FILTER_BITS, hz_out0);
+    HORIZ_2TAP_FILT_UH(src7, src7, mask, filt_hz, FILTER_BITS, hz_out2);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    out3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+    ref0 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    ref1 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    ref2 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+    ref3 = __lsx_vld(dst, 0);
+    dst += dst_stride;
+
+    DUP4_ARG2(__lsx_vavgr_bu, out0, pred0, out1, pred1, out2, pred2, out3,
+              pred3, out0, out1, out2, out3);
+
+    CALC_MSE_AVG_B(out0, ref0, var, avg);
+    CALC_MSE_AVG_B(out1, ref1, var, avg);
+    CALC_MSE_AVG_B(out2, ref2, var, avg);
+    CALC_MSE_AVG_B(out3, ref3, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
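+/* The 64-wide averaging wrappers below run the 16-wide kernel over four
+ * adjacent columns (stepping src/dst/sec_pred by 16) and sum the partial
+ * diffs.
+ */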
+static uint32_t sub_pixel_avg_sse_diff_64width_h_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+    int32_t height, int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[4];
+
+  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+    sse +=
+        subpel_avg_ssediff_16w_h_lsx(src, src_stride, dst, dst_stride, sec_pred,
+                                     filter, height, &diff0[loop_cnt], 64);
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+
+  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_v_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter,
+    int32_t height, int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[4];
+
+  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+    sse +=
+        subpel_avg_ssediff_16w_v_lsx(src, src_stride, dst, dst_stride, sec_pred,
+                                     filter, height, &diff0[loop_cnt], 64);
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+
+  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+  return sse;
+}
+
+static uint32_t sub_pixel_avg_sse_diff_64width_hv_lsx(
+    const uint8_t *src, int32_t src_stride, const uint8_t *dst,
+    int32_t dst_stride, const uint8_t *sec_pred, const uint8_t *filter_horiz,
+    const uint8_t *filter_vert, int32_t height, int32_t *diff) {
+  uint32_t loop_cnt, sse = 0;
+  int32_t diff0[4];
+
+  for (loop_cnt = 0; loop_cnt < 4; ++loop_cnt) {
+    sse += subpel_avg_ssediff_16w_hv_lsx(src, src_stride, dst, dst_stride,
+                                         sec_pred, filter_horiz, filter_vert,
+                                         height, &diff0[loop_cnt], 64);
+    src += 16;
+    dst += 16;
+    sec_pred += 16;
+  }
+
+  *diff = diff0[0] + diff0[1] + diff0[2] + diff0[3];
+
+  return sse;
+}
+
+#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6)
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8)
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10)
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12)
+
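+/* Dispatch per subpel phase: a zero x_offset or y_offset means no filtering
+ * is needed along that axis, so the macro falls through to the cheaper
+ * v-only, h-only or plain-variance path.
+ */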
+#define VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(wd, ht)                              \
+  uint32_t vpx_sub_pixel_variance##wd##x##ht##_lsx(                           \
+      const uint8_t *src, int32_t src_stride, int32_t x_offset,               \
+      int32_t y_offset, const uint8_t *ref, int32_t ref_stride,               \
+      uint32_t *sse) {                                                        \
+    int32_t diff;                                                             \
+    uint32_t var;                                                             \
+    const uint8_t *h_filter = bilinear_filters_lsx[x_offset];                 \
+    const uint8_t *v_filter = bilinear_filters_lsx[y_offset];                 \
+                                                                              \
+    if (y_offset) {                                                           \
+      if (x_offset) {                                                         \
+        *sse = sub_pixel_sse_diff_##wd##width_hv_lsx(                         \
+            src, src_stride, ref, ref_stride, h_filter, v_filter, ht, &diff); \
+      } else {                                                                \
+        *sse = sub_pixel_sse_diff_##wd##width_v_lsx(                          \
+            src, src_stride, ref, ref_stride, v_filter, ht, &diff);           \
+      }                                                                       \
+                                                                              \
+      var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                             \
+    } else {                                                                  \
+      if (x_offset) {                                                         \
+        *sse = sub_pixel_sse_diff_##wd##width_h_lsx(                          \
+            src, src_stride, ref, ref_stride, h_filter, ht, &diff);           \
+                                                                              \
+        var = VARIANCE_##wd##Wx##ht##H(*sse, diff);                           \
+      } else {                                                                \
+        var = vpx_variance##wd##x##ht##_lsx(src, src_stride, ref, ref_stride, \
+                                            sse);                             \
+      }                                                                       \
+    }                                                                         \
+                                                                              \
+    return var;                                                               \
+  }
+
+VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(8, 8)
+VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(16, 16)
+VPX_SUB_PIXEL_VARIANCE_WDXHT_LSX(32, 32)
+
+#define VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(ht)                           \
+  uint32_t vpx_sub_pixel_avg_variance64x##ht##_lsx(                           \
+      const uint8_t *src_ptr, int32_t src_stride, int32_t x_offset,           \
+      int32_t y_offset, const uint8_t *ref_ptr, int32_t ref_stride,           \
+      uint32_t *sse, const uint8_t *sec_pred) {                               \
+    int32_t diff;                                                             \
+    const uint8_t *h_filter = bilinear_filters_lsx[x_offset];                 \
+    const uint8_t *v_filter = bilinear_filters_lsx[y_offset];                 \
+                                                                              \
+    if (y_offset) {                                                           \
+      if (x_offset) {                                                         \
+        *sse = sub_pixel_avg_sse_diff_64width_hv_lsx(                         \
+            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter,     \
+            v_filter, ht, &diff);                                             \
+      } else {                                                                \
+        *sse = sub_pixel_avg_sse_diff_64width_v_lsx(                          \
+            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, v_filter, ht, \
+            &diff);                                                           \
+      }                                                                       \
+    } else {                                                                  \
+      if (x_offset) {                                                         \
+        *sse = sub_pixel_avg_sse_diff_64width_h_lsx(                          \
+            src_ptr, src_stride, ref_ptr, ref_stride, sec_pred, h_filter, ht, \
+            &diff);                                                           \
+      } else {                                                                \
+        *sse = avg_sse_diff_64x##ht##_lsx(src_ptr, src_stride, ref_ptr,       \
+                                          ref_stride, sec_pred, &diff);       \
+      }                                                                       \
+    }                                                                         \
+                                                                              \
+    return VARIANCE_64Wx##ht##H(*sse, diff);                                  \
+  }
+
+VPX_SUB_PIXEL_AVG_VARIANCE64XHEIGHT_LSX(64)
diff --git a/libvpx/vpx_dsp/loongarch/subtract_lsx.c b/libvpx/vpx_dsp/loongarch/subtract_lsx.c
new file mode 100644
index 0000000..943a5c5
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/subtract_lsx.c
@@ -0,0 +1,371 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
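+/* Block residual: diff[r][c] = src[r][c] - pred[r][c], widened to int16_t.
+ * The LSX kernels interleave pred (even lanes) with src (odd lanes) and use
+ * __lsx_vhsubw_hu_bu, which subtracts the even-lane bytes from the odd-lane
+ * bytes and widens to 16 bits in one instruction.
+ */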
+static void sub_blk_4x4_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                            const uint8_t *pred_ptr, int32_t pred_stride,
+                            int16_t *diff_ptr, int32_t diff_stride) {
+  __m128i src0, src1, src2, src3;
+  __m128i pred0, pred1, pred2, pred3;
+  __m128i diff0, diff1;
+  __m128i reg0, reg1;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t pred_stride2 = pred_stride << 1;
+  int32_t diff_stride2 = diff_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t pred_stride3 = pred_stride2 + pred_stride;
+  int32_t diff_stride3 = diff_stride2 + diff_stride;
+
+  DUP4_ARG2(__lsx_vldrepl_w, src_ptr, 0, src_ptr + src_stride, 0,
+            src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1,
+            src2, src3);
+  DUP4_ARG2(__lsx_vldrepl_w, pred_ptr, 0, pred_ptr + pred_stride, 0,
+            pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0,
+            pred1, pred2, pred3);
+  DUP4_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, pred1, pred0, pred3, pred2,
+            src0, src2, pred0, pred2);
+  DUP2_ARG2(__lsx_vilvl_d, src2, src0, pred2, pred0, src0, pred0);
+  reg0 = __lsx_vilvl_b(src0, pred0);
+  reg1 = __lsx_vilvh_b(src0, pred0);
+  DUP2_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, diff0, diff1);
+  __lsx_vstelm_d(diff0, diff_ptr, 0, 0);
+  __lsx_vstelm_d(diff0, diff_ptr + diff_stride, 0, 1);
+  __lsx_vstelm_d(diff1, diff_ptr + diff_stride2, 0, 0);
+  __lsx_vstelm_d(diff1, diff_ptr + diff_stride3, 0, 1);
+}
+
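+/* Note: the dst_stride* locals below are byte strides (diff_stride << 1),
+ * since diff holds int16_t values but __lsx_vstx takes a byte offset.
+ */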
+static void sub_blk_8x8_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                            const uint8_t *pred_ptr, int32_t pred_stride,
+                            int16_t *diff_ptr, int32_t diff_stride) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t pred_stride2 = pred_stride << 1;
+  int32_t dst_stride = diff_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t pred_stride3 = pred_stride2 + pred_stride;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t pred_stride4 = pred_stride2 << 1;
+  int32_t dst_stride3 = dst_stride + dst_stride2;
+
+  DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0,
+            src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1,
+            src2, src3);
+  DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0,
+            pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred0,
+            pred1, pred2, pred3);
+  src_ptr += src_stride4;
+  pred_ptr += pred_stride4;
+
+  DUP4_ARG2(__lsx_vldrepl_d, src_ptr, 0, src_ptr + src_stride, 0,
+            src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src4, src5,
+            src6, src7);
+  DUP4_ARG2(__lsx_vldrepl_d, pred_ptr, 0, pred_ptr + pred_stride, 0,
+            pred_ptr + pred_stride2, 0, pred_ptr + pred_stride3, 0, pred4,
+            pred5, pred6, pred7);
+
+  DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+            reg0, reg1, reg2, reg3);
+  DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+            reg4, reg5, reg6, reg7);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3,
+            src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7,
+            src4, src5, src6, src7);
+  __lsx_vst(src0, diff_ptr, 0);
+  __lsx_vstx(src1, diff_ptr, dst_stride);
+  __lsx_vstx(src2, diff_ptr, dst_stride2);
+  __lsx_vstx(src3, diff_ptr, dst_stride3);
+  diff_ptr += dst_stride2;
+  __lsx_vst(src4, diff_ptr, 0);
+  __lsx_vstx(src5, diff_ptr, dst_stride);
+  __lsx_vstx(src6, diff_ptr, dst_stride2);
+  __lsx_vstx(src7, diff_ptr, dst_stride3);
+}
+
+static void sub_blk_16x16_lsx(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *pred, int32_t pred_stride,
+                              int16_t *diff, int32_t diff_stride) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t pred_stride2 = pred_stride << 1;
+  int32_t dst_stride = diff_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t pred_stride3 = pred_stride2 + pred_stride;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t pred_stride4 = pred_stride2 << 1;
+  int32_t dst_stride3 = dst_stride + dst_stride2;
+  int16_t *diff_tmp = diff + 8;
+
+  DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred,
+            pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4);
+  src += src_stride4;
+  pred += pred_stride4;
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            pred, pred_stride, src5, src6, src7, pred5);
+  DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7);
+  src += src_stride4;
+  pred += pred_stride4;
+  DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+            reg0, reg2, reg4, reg6);
+  DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+            reg1, reg3, reg5, reg7);
+  DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+            tmp0, tmp2, tmp4, tmp6);
+  DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+            tmp1, tmp3, tmp5, tmp7);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3,
+            src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7,
+            src4, src5, src6, src7);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3,
+            pred0, pred1, pred2, pred3);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7,
+            pred4, pred5, pred6, pred7);
+  __lsx_vst(src0, diff, 0);
+  __lsx_vstx(src2, diff, dst_stride);
+  __lsx_vstx(src4, diff, dst_stride2);
+  __lsx_vstx(src6, diff, dst_stride3);
+  __lsx_vst(src1, diff_tmp, 0);
+  __lsx_vstx(src3, diff_tmp, dst_stride);
+  __lsx_vstx(src5, diff_tmp, dst_stride2);
+  __lsx_vstx(src7, diff_tmp, dst_stride3);
+  diff += dst_stride2;
+  diff_tmp += dst_stride2;
+  __lsx_vst(pred0, diff, 0);
+  __lsx_vstx(pred2, diff, dst_stride);
+  __lsx_vstx(pred4, diff, dst_stride2);
+  __lsx_vstx(pred6, diff, dst_stride3);
+  __lsx_vst(pred1, diff_tmp, 0);
+  __lsx_vstx(pred3, diff_tmp, dst_stride);
+  __lsx_vstx(pred5, diff_tmp, dst_stride2);
+  __lsx_vstx(pred7, diff_tmp, dst_stride3);
+  diff += dst_stride2;
+  diff_tmp += dst_stride2;
+  DUP2_ARG2(__lsx_vld, src, 0, pred, 0, src0, pred0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  DUP4_ARG2(__lsx_vldx, pred, pred_stride, pred, pred_stride2, pred,
+            pred_stride3, pred, pred_stride4, pred1, pred2, pred3, pred4);
+  src += src_stride4;
+  pred += pred_stride4;
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            pred, pred_stride, src5, src6, src7, pred5);
+  DUP2_ARG2(__lsx_vldx, pred, pred_stride2, pred, pred_stride3, pred6, pred7);
+  DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+            reg0, reg2, reg4, reg6);
+  DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+            reg1, reg3, reg5, reg7);
+  DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+            tmp0, tmp2, tmp4, tmp6);
+  DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+            tmp1, tmp3, tmp5, tmp7);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3, reg3,
+            src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7, reg7,
+            src4, src5, src6, src7);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3, tmp3,
+            pred0, pred1, pred2, pred3);
+  DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7, tmp7,
+            pred4, pred5, pred6, pred7);
+  __lsx_vst(src0, diff, 0);
+  __lsx_vstx(src2, diff, dst_stride);
+  __lsx_vstx(src4, diff, dst_stride2);
+  __lsx_vstx(src6, diff, dst_stride3);
+  __lsx_vst(src1, diff_tmp, 0);
+  __lsx_vstx(src3, diff_tmp, dst_stride);
+  __lsx_vstx(src5, diff_tmp, dst_stride2);
+  __lsx_vstx(src7, diff_tmp, dst_stride3);
+  diff += dst_stride2;
+  diff_tmp += dst_stride2;
+  __lsx_vst(pred0, diff, 0);
+  __lsx_vstx(pred2, diff, dst_stride);
+  __lsx_vstx(pred4, diff, dst_stride2);
+  __lsx_vstx(pred6, diff, dst_stride3);
+  __lsx_vst(pred1, diff_tmp, 0);
+  __lsx_vstx(pred3, diff_tmp, dst_stride);
+  __lsx_vstx(pred5, diff_tmp, dst_stride2);
+  __lsx_vstx(pred7, diff_tmp, dst_stride3);
+}
+
+static void sub_blk_32x32_lsx(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *pred, int32_t pred_stride,
+                              int16_t *diff, int32_t diff_stride) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  uint32_t loop_cnt;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t pred_stride2 = pred_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t pred_stride3 = pred_stride2 + pred_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t pred_stride4 = pred_stride2 << 1;
+
+  for (loop_cnt = 8; loop_cnt--;) {
+    const uint8_t *src_tmp = src + 16;
+    const uint8_t *pred_tmp = pred + 16;
+    DUP4_ARG2(__lsx_vld, src, 0, src_tmp, 0, pred, 0, pred_tmp, 0, src0, src1,
+              pred0, pred1);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
+              src_stride2, src_tmp, src_stride2, src2, src3, src4, src5);
+    DUP4_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, pred,
+              pred_stride, pred_tmp, pred_stride, src6, src7, pred2, pred3);
+    DUP4_ARG2(__lsx_vldx, pred, pred_stride2, pred_tmp, pred_stride2, pred,
+              pred_stride3, pred_tmp, pred_stride3, pred4, pred5, pred6, pred7);
+    DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+              reg0, reg2, reg4, reg6);
+    DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+              reg1, reg3, reg5, reg7);
+    DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+              tmp0, tmp2, tmp4, tmp6);
+    DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+              tmp1, tmp3, tmp5, tmp7);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3,
+              reg3, src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7,
+              reg7, src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3,
+              tmp3, pred0, pred1, pred2, pred3);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7,
+              tmp7, pred4, pred5, pred6, pred7);
+    src += src_stride4;
+    pred += pred_stride4;
+    __lsx_vst(src0, diff, 0);
+    __lsx_vst(src1, diff, 16);
+    __lsx_vst(src2, diff, 32);
+    __lsx_vst(src3, diff, 48);
+    diff += diff_stride;
+    __lsx_vst(src4, diff, 0);
+    __lsx_vst(src5, diff, 16);
+    __lsx_vst(src6, diff, 32);
+    __lsx_vst(src7, diff, 48);
+    diff += diff_stride;
+    __lsx_vst(pred0, diff, 0);
+    __lsx_vst(pred1, diff, 16);
+    __lsx_vst(pred2, diff, 32);
+    __lsx_vst(pred3, diff, 48);
+    diff += diff_stride;
+    __lsx_vst(pred4, diff, 0);
+    __lsx_vst(pred5, diff, 16);
+    __lsx_vst(pred6, diff, 32);
+    __lsx_vst(pred7, diff, 48);
+    diff += diff_stride;
+  }
+}
+
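+/* 64x64 residual: two 64-pixel rows per iteration (32 iterations); each row
+ * is loaded as four byte vectors and stored as eight int16 vectors. */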
+static void sub_blk_64x64_lsx(const uint8_t *src, int32_t src_stride,
+                              const uint8_t *pred, int32_t pred_stride,
+                              int16_t *diff, int32_t diff_stride) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i pred0, pred1, pred2, pred3, pred4, pred5, pred6, pred7;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5, reg6, reg7;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  uint32_t loop_cnt;
+
+  for (loop_cnt = 32; loop_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred0, pred1,
+              pred2, pred3);
+    src += src_stride;
+    pred += pred_stride;
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src4, src5, src6,
+              src7);
+    DUP4_ARG2(__lsx_vld, pred, 0, pred, 16, pred, 32, pred, 48, pred4, pred5,
+              pred6, pred7);
+    src += src_stride;
+    pred += pred_stride;
+
+    DUP4_ARG2(__lsx_vilvl_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+              reg0, reg2, reg4, reg6);
+    DUP4_ARG2(__lsx_vilvh_b, src0, pred0, src1, pred1, src2, pred2, src3, pred3,
+              reg1, reg3, reg5, reg7);
+    DUP4_ARG2(__lsx_vilvl_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+              tmp0, tmp2, tmp4, tmp6);
+    DUP4_ARG2(__lsx_vilvh_b, src4, pred4, src5, pred5, src6, pred6, src7, pred7,
+              tmp1, tmp3, tmp5, tmp7);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, reg0, reg0, reg1, reg1, reg2, reg2, reg3,
+              reg3, src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, reg4, reg4, reg5, reg5, reg6, reg6, reg7,
+              reg7, src4, src5, src6, src7);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, tmp3,
+              tmp3, pred0, pred1, pred2, pred3);
+    DUP4_ARG2(__lsx_vhsubw_hu_bu, tmp4, tmp4, tmp5, tmp5, tmp6, tmp6, tmp7,
+              tmp7, pred4, pred5, pred6, pred7);
+    __lsx_vst(src0, diff, 0);
+    __lsx_vst(src1, diff, 16);
+    __lsx_vst(src2, diff, 32);
+    __lsx_vst(src3, diff, 48);
+    __lsx_vst(src4, diff, 64);
+    __lsx_vst(src5, diff, 80);
+    __lsx_vst(src6, diff, 96);
+    __lsx_vst(src7, diff, 112);
+    diff += diff_stride;
+    __lsx_vst(pred0, diff, 0);
+    __lsx_vst(pred1, diff, 16);
+    __lsx_vst(pred2, diff, 32);
+    __lsx_vst(pred3, diff, 48);
+    __lsx_vst(pred4, diff, 64);
+    __lsx_vst(pred5, diff, 80);
+    __lsx_vst(pred6, diff, 96);
+    __lsx_vst(pred7, diff, 112);
+    diff += diff_stride;
+  }
+}
+
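+/* Square blocks from 4x4 to 64x64 are handled by the LSX kernels above; any
+ * other geometry falls back to the portable C implementation. */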
+void vpx_subtract_block_lsx(int32_t rows, int32_t cols, int16_t *diff_ptr,
+                            ptrdiff_t diff_stride, const uint8_t *src_ptr,
+                            ptrdiff_t src_stride, const uint8_t *pred_ptr,
+                            ptrdiff_t pred_stride) {
+  if (rows == cols) {
+    switch (rows) {
+      case 4:
+        sub_blk_4x4_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                        diff_stride);
+        break;
+      case 8:
+        sub_blk_8x8_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                        diff_stride);
+        break;
+      case 16:
+        sub_blk_16x16_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                          diff_stride);
+        break;
+      case 32:
+        sub_blk_32x32_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                          diff_stride);
+        break;
+      case 64:
+        sub_blk_64x64_lsx(src_ptr, src_stride, pred_ptr, pred_stride, diff_ptr,
+                          diff_stride);
+        break;
+      default:
+        vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr,
+                             src_stride, pred_ptr, pred_stride);
+        break;
+    }
+  } else {
+    vpx_subtract_block_c(rows, cols, diff_ptr, diff_stride, src_ptr, src_stride,
+                         pred_ptr, pred_stride);
+  }
+}
diff --git a/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h b/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h
new file mode 100644
index 0000000..bd51483
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/txfm_macros_lsx.h
@@ -0,0 +1,48 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_
+
+#include "vpx_util/loongson_intrinsics.h"
+
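+/* Multiply a vector pair by a constant pair (butterfly rotation):
+ *   out0 = round_shift(reg0 * cnst0 - reg1 * cnst1, DCT_CONST_BITS)
+ *   out1 = round_shift(reg0 * cnst1 + reg1 * cnst0, DCT_CONST_BITS)
+ * Products are widened to 32 bits and saturating-packed back to int16. */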
+#define DOTP_CONST_PAIR(reg0, reg1, cnst0, cnst1, out0, out1)         \
+  do {                                                                \
+    __m128i s0_m, s1_m, s2_m, s3_m, s4_m, s5_m;                       \
+    __m128i k0_m, k1_m, k2_m, k3_m;                                   \
+                                                                      \
+    k0_m = __lsx_vreplgr2vr_h(cnst0);                                 \
+    k1_m = __lsx_vreplgr2vr_h(cnst1);                                 \
+    k2_m = __lsx_vpackev_h(k1_m, k0_m);                               \
+                                                                      \
+    DUP2_ARG2(__lsx_vilvl_h, reg1, reg0, reg0, reg1, s5_m, s3_m);     \
+    DUP2_ARG2(__lsx_vilvh_h, reg1, reg0, reg0, reg1, s4_m, s2_m);     \
+                                                                      \
+    DUP2_ARG2(__lsx_vmulwev_w_h, s5_m, k0_m, s4_m, k0_m, s1_m, s0_m); \
+    k3_m = __lsx_vmulwod_w_h(s5_m, k1_m);                             \
+    s1_m = __lsx_vsub_w(s1_m, k3_m);                                  \
+    k3_m = __lsx_vmulwod_w_h(s4_m, k1_m);                             \
+    s0_m = __lsx_vsub_w(s0_m, k3_m);                                  \
+                                                                      \
+    out0 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS);            \
+                                                                      \
+    DUP2_ARG2(__lsx_vdp2_w_h, s3_m, k2_m, s2_m, k2_m, s1_m, s0_m);    \
+    out1 = __lsx_vssrarni_h_w(s0_m, s1_m, DCT_CONST_BITS);            \
+  } while (0)
+
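+/* Dot-product in0 and in1 with the packed coefficient pair in2, then
+ * round-shift by DCT_CONST_BITS and pack both 32-bit results into the int16
+ * vector in3. */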
+#define DOT_SHIFT_RIGHT_PCK_H(in0, in1, in2, in3)                \
+  do {                                                           \
+    __m128i tp0_m, tp1_m;                                        \
+                                                                 \
+    DUP2_ARG2(__lsx_vdp2_w_h, in0, in2, in1, in2, tp1_m, tp0_m); \
+    in3 = __lsx_vssrarni_h_w(tp1_m, tp0_m, DCT_CONST_BITS);      \
+  } while (0)
+
+#endif  // VPX_VPX_DSP_LOONGARCH_TXFM_MACROS_LSX_H_
diff --git a/libvpx/vpx_dsp/loongarch/variance_lsx.c b/libvpx/vpx_dsp/loongarch/variance_lsx.c
new file mode 100644
index 0000000..8fad342
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/variance_lsx.c
@@ -0,0 +1,263 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/variance_lsx.h"
+
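+/* variance = sse - sum * sum / (w * h), with shift == log2(w * h).  The
+ * LARGE variant keeps sum * sum in 64 bits so the 32x32 and 64x64 cases
+ * cannot overflow. */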
+#define VARIANCE_WxH(sse, diff, shift) \
+  (sse) - (((uint32_t)(diff) * (diff)) >> (shift))
+
+#define VARIANCE_LARGE_WxH(sse, diff, shift) \
+  (sse) - (((int64_t)(diff) * (diff)) >> (shift))
+
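+/* 8-wide SSE and sum of differences: four rows per iteration, packed in
+ * pairs (vpickev_d) into full 16-byte vectors before accumulation. */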
+static uint32_t sse_diff_8width_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                                    const uint8_t *ref_ptr, int32_t ref_stride,
+                                    int32_t height, int32_t *diff) {
+  int32_t res, ht_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, ref0, ref1, ref2, ref3, vec;
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t ref_stride2 = ref_stride << 1;
+  int32_t ref_stride3 = ref_stride2 + ref_stride;
+  int32_t ref_stride4 = ref_stride2 << 1;
+
+  for (; ht_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr + src_stride, 0,
+              src_ptr + src_stride2, 0, src_ptr + src_stride3, 0, src0, src1,
+              src2, src3);
+    src_ptr += src_stride4;
+    DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr + ref_stride, 0,
+              ref_ptr + ref_stride2, 0, ref_ptr + ref_stride3, 0, ref0, ref1,
+              ref2, ref3);
+    ref_ptr += ref_stride4;
+
+    DUP4_ARG2(__lsx_vpickev_d, src1, src0, src3, src2, ref1, ref0, ref3, ref2,
+              src0, src1, ref0, ref1);
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+  }
+
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sse_diff_16width_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                                     const uint8_t *ref_ptr, int32_t ref_stride,
+                                     int32_t height, int32_t *diff) {
+  int32_t res, ht_cnt = (height >> 2);
+  __m128i src, ref, vec;
+  __m128i avg = __lsx_vldi(0);
+  __m128i var = avg;
+
+  for (; ht_cnt--;) {
+    src = __lsx_vld(src_ptr, 0);
+    src_ptr += src_stride;
+    ref = __lsx_vld(ref_ptr, 0);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src, ref, var, avg);
+
+    src = __lsx_vld(src_ptr, 0);
+    src_ptr += src_stride;
+    ref = __lsx_vld(ref_ptr, 0);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src, ref, var, avg);
+
+    src = __lsx_vld(src_ptr, 0);
+    src_ptr += src_stride;
+    ref = __lsx_vld(ref_ptr, 0);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src, ref, var, avg);
+
+    src = __lsx_vld(src_ptr, 0);
+    src_ptr += src_stride;
+    ref = __lsx_vld(ref_ptr, 0);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src, ref, var, avg);
+  }
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sse_diff_32width_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                                     const uint8_t *ref_ptr, int32_t ref_stride,
+                                     int32_t height, int32_t *diff) {
+  int32_t res, ht_cnt = (height >> 2);
+  __m128i avg = __lsx_vldi(0);
+  __m128i src0, src1, ref0, ref1;
+  __m128i vec;
+  __m128i var = avg;
+
+  for (; ht_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src0, src1);
+    src_ptr += src_stride;
+    DUP2_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref0, ref1);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg);
+    CALC_MSE_AVG_B(src1, ref1, var, avg);
+  }
+
+  vec = __lsx_vhaddw_w_h(avg, avg);
+  HADD_SW_S32(vec, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+static uint32_t sse_diff_64x64_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                                   const uint8_t *ref_ptr, int32_t ref_stride,
+                                   int32_t *diff) {
+  int32_t res, ht_cnt = 32;
+  __m128i avg0 = __lsx_vldi(0);
+  __m128i src0, src1, src2, src3;
+  __m128i ref0, ref1, ref2, ref3;
+  __m128i vec0, vec1;
+  __m128i avg1 = avg0;
+  __m128i avg2 = avg0;
+  __m128i avg3 = avg0;
+  __m128i var = avg0;
+
+  for (; ht_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+              src0, src1, src2, src3);
+    src_ptr += src_stride;
+    DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src2, ref2, var, avg2);
+    CALC_MSE_AVG_B(src3, ref3, var, avg3);
+    DUP4_ARG2(__lsx_vld, src_ptr, 0, src_ptr, 16, src_ptr, 32, src_ptr, 48,
+              src0, src1, src2, src3);
+    src_ptr += src_stride;
+    DUP4_ARG2(__lsx_vld, ref_ptr, 0, ref_ptr, 16, ref_ptr, 32, ref_ptr, 48,
+              ref0, ref1, ref2, ref3);
+    ref_ptr += ref_stride;
+    CALC_MSE_AVG_B(src0, ref0, var, avg0);
+    CALC_MSE_AVG_B(src1, ref1, var, avg1);
+    CALC_MSE_AVG_B(src2, ref2, var, avg2);
+    CALC_MSE_AVG_B(src3, ref3, var, avg3);
+  }
+  vec0 = __lsx_vhaddw_w_h(avg0, avg0);
+  vec1 = __lsx_vhaddw_w_h(avg1, avg1);
+  vec0 = __lsx_vadd_w(vec0, vec1);
+  vec1 = __lsx_vhaddw_w_h(avg2, avg2);
+  vec0 = __lsx_vadd_w(vec0, vec1);
+  vec1 = __lsx_vhaddw_w_h(avg3, avg3);
+  vec0 = __lsx_vadd_w(vec0, vec1);
+  HADD_SW_S32(vec0, *diff);
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+#define VARIANCE_8Wx8H(sse, diff) VARIANCE_WxH(sse, diff, 6)
+#define VARIANCE_16Wx16H(sse, diff) VARIANCE_WxH(sse, diff, 8)
+
+#define VARIANCE_32Wx32H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 10)
+#define VARIANCE_64Wx64H(sse, diff) VARIANCE_LARGE_WxH(sse, diff, 12)
+
+#define VPX_VARIANCE_WDXHT_LSX(wd, ht)                                         \
+  uint32_t vpx_variance##wd##x##ht##_lsx(                                      \
+      const uint8_t *src, int32_t src_stride, const uint8_t *ref,              \
+      int32_t ref_stride, uint32_t *sse) {                                     \
+    int32_t diff;                                                              \
+                                                                               \
+    *sse =                                                                     \
+        sse_diff_##wd##width_lsx(src, src_stride, ref, ref_stride, ht, &diff); \
+                                                                               \
+    return VARIANCE_##wd##Wx##ht##H(*sse, diff);                               \
+  }
+
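+/* For reference, the instantiation VPX_VARIANCE_WDXHT_LSX(8, 8) below
+ * expands (modulo whitespace) to:
+ *
+ *   uint32_t vpx_variance8x8_lsx(const uint8_t *src, int32_t src_stride,
+ *                                const uint8_t *ref, int32_t ref_stride,
+ *                                uint32_t *sse) {
+ *     int32_t diff;
+ *     *sse = sse_diff_8width_lsx(src, src_stride, ref, ref_stride, 8, &diff);
+ *     return VARIANCE_8Wx8H(*sse, diff);
+ *   }
+ */
+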
+static uint32_t sse_16width_lsx(const uint8_t *src_ptr, int32_t src_stride,
+                                const uint8_t *ref_ptr, int32_t ref_stride,
+                                int32_t height) {
+  int32_t res, ht_cnt = (height >> 2);
+  __m128i src, ref;
+  __m128i var = __lsx_vldi(0);
+
+  for (; ht_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src, ref, var);
+
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src, ref, var);
+
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src, ref, var);
+
+    DUP2_ARG2(__lsx_vld, src_ptr, 0, ref_ptr, 0, src, ref);
+    src_ptr += src_stride;
+    ref_ptr += ref_stride;
+    CALC_MSE_B(src, ref, var);
+  }
+  HADD_SW_S32(var, res);
+  return res;
+}
+
+VPX_VARIANCE_WDXHT_LSX(8, 8)
+VPX_VARIANCE_WDXHT_LSX(16, 16)
+VPX_VARIANCE_WDXHT_LSX(32, 32)
+
+uint32_t vpx_variance64x64_lsx(const uint8_t *src, int32_t src_stride,
+                               const uint8_t *ref, int32_t ref_stride,
+                               uint32_t *sse) {
+  int32_t diff;
+
+  *sse = sse_diff_64x64_lsx(src, src_stride, ref, ref_stride, &diff);
+
+  return VARIANCE_64Wx64H(*sse, diff);
+}
+
+uint32_t vpx_mse16x16_lsx(const uint8_t *src, int32_t src_stride,
+                          const uint8_t *ref, int32_t ref_stride,
+                          uint32_t *sse) {
+  *sse = sse_16width_lsx(src, src_stride, ref, ref_stride, 16);
+
+  return *sse;
+}
+
+void vpx_get16x16var_lsx(const uint8_t *src, int32_t src_stride,
+                         const uint8_t *ref, int32_t ref_stride, uint32_t *sse,
+                         int32_t *sum) {
+  *sse = sse_diff_16width_lsx(src, src_stride, ref, ref_stride, 16, sum);
+}
diff --git a/libvpx/vpx_dsp/loongarch/variance_lsx.h b/libvpx/vpx_dsp/loongarch/variance_lsx.h
new file mode 100644
index 0000000..cf9e989
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/variance_lsx.h
@@ -0,0 +1,62 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_
+
+#include "vpx_util/loongson_intrinsics.h"
+
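+/* Horizontally add the four 32-bit lanes of in0 and write the scalar sum to
+ * in1. */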
+#define HADD_SW_S32(in0, in1)                  \
+  do {                                         \
+    __m128i res0_m;                            \
+                                               \
+    res0_m = __lsx_vhaddw_d_w(in0, in0);       \
+    res0_m = __lsx_vhaddw_q_d(res0_m, res0_m); \
+    in1 = __lsx_vpickve2gr_w(res0_m, 0);       \
+  } while (0)
+
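+/* 2-tap horizontal filter: gather pixel pairs from in0/in1 with the shuffle
+ * mask, dot-product them with the coefficient pair and round-shift the
+ * result into in2. */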
+#define HORIZ_2TAP_FILT_UH(in0, in1, mask, coeff, shift, in2) \
+  do {                                                        \
+    __m128i tmp0_m, tmp1_m;                                   \
+                                                              \
+    tmp0_m = __lsx_vshuf_b(in1, in0, mask);                   \
+    tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff);                  \
+    in2 = __lsx_vsrari_h(tmp1_m, shift);                      \
+  } while (0)
+
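+/* Accumulate the sum of squared src/ref byte differences into var. */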
+#define CALC_MSE_B(src, ref, var)                                         \
+  do {                                                                    \
+    __m128i src_l0_m, src_l1_m;                                           \
+    __m128i res_l0_m, res_l1_m;                                           \
+                                                                          \
+    src_l0_m = __lsx_vilvl_b(src, ref);                                   \
+    src_l1_m = __lsx_vilvh_b(src, ref);                                   \
+    DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \
+              res_l0_m, res_l1_m);                                        \
+    var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m);                     \
+    var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m);                     \
+  } while (0)
+
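+/* As CALC_MSE_B, but also accumulate the signed differences into sub so the
+ * caller can derive the block mean. */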
+#define CALC_MSE_AVG_B(src, ref, var, sub)                                \
+  do {                                                                    \
+    __m128i src_l0_m, src_l1_m;                                           \
+    __m128i res_l0_m, res_l1_m;                                           \
+                                                                          \
+    src_l0_m = __lsx_vilvl_b(src, ref);                                   \
+    src_l1_m = __lsx_vilvh_b(src, ref);                                   \
+    DUP2_ARG2(__lsx_vhsubw_hu_bu, src_l0_m, src_l0_m, src_l1_m, src_l1_m, \
+              res_l0_m, res_l1_m);                                        \
+    var = __lsx_vdp2add_w_h(var, res_l0_m, res_l0_m);                     \
+    var = __lsx_vdp2add_w_h(var, res_l1_m, res_l1_m);                     \
+    sub = __lsx_vadd_h(sub, res_l0_m);                                    \
+    sub = __lsx_vadd_h(sub, res_l1_m);                                    \
+  } while (0)
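+
+/* A scalar model of the two macros above, shown as a sketch for reference
+ * only (one 16-byte vector per invocation):
+ *
+ *   for (i = 0; i < 16; ++i) {
+ *     const int d = src[i] - ref[i];
+ *     var += d * d;  // CALC_MSE_B and CALC_MSE_AVG_B
+ *     sub += d;      // CALC_MSE_AVG_B only
+ *   }
+ */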
+
+#endif  // VPX_VPX_DSP_LOONGARCH_VARIANCE_LSX_H_
diff --git a/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c b/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c
new file mode 100644
index 0000000..1c59228
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_horiz_lsx.c
@@ -0,0 +1,972 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
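+/* Shuffle-control bytes for __lsx_vshuf_b: each row pairs adjacent source
+ * bytes so the filter taps can be applied as dot products.  Offset 0 serves
+ * the 8-pixel-and-wider paths, offset 16 the 4-pixel paths. */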
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+  /* 8 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  /* 4 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+  /* 4 width cases */
+  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
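+/* The *_and_aver_dst_* kernels implement vpx_convolve8_avg semantics: filter
+ * the source, then round-average the result with the bytes already in dst
+ * before storing.  The 8-tap paths XOR pixels with 128 so signed dot
+ * products can be used, and XOR the packed result back afterwards. */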
+static void common_hz_8t_and_aver_dst_4x4_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i tmp0, tmp1;
+  __m128i dst0, dst1, dst2, dst3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, tmp0, tmp1);
+  dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst0 = __lsx_vilvl_w(dst1, dst0);
+  dst1 = __lsx_vilvl_w(dst3, dst2);
+  dst0 = __lsx_vilvl_d(dst1, dst0);
+  tmp0 = __lsx_vssrarni_b_h(tmp1, tmp0, 7);
+  tmp0 = __lsx_vxori_b(tmp0, 128);
+  dst0 = __lsx_vavgr_bu(tmp0, dst0);
+  __lsx_vstelm_w(dst0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst0, dst, 0, 3);
+}
+
+static void common_hz_8t_and_aver_dst_4x8_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3, tmp0, tmp1, tmp2, tmp3;
+  __m128i dst0, dst1;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  src += src_stride;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  tmp0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp0 = __lsx_vilvl_w(tmp1, tmp0);
+  tmp1 = __lsx_vilvl_w(tmp3, tmp2);
+  dst0 = __lsx_vilvl_d(tmp1, tmp0);
+
+  tmp0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  tmp3 = __lsx_vldrepl_w(dst_tmp, 0);
+  tmp0 = __lsx_vilvl_w(tmp1, tmp0);
+  tmp1 = __lsx_vilvl_w(tmp3, tmp2);
+  dst1 = __lsx_vilvl_d(tmp1, tmp0);
+
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, tmp0, tmp1);
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, tmp2, tmp3);
+  DUP4_ARG3(__lsx_vssrarni_b_h, tmp0, tmp0, 7, tmp1, tmp1, 7, tmp2, tmp2, 7,
+            tmp3, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
+  DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+  DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+  DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1);
+  __lsx_vstelm_w(dst0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst0, dst, 0, 3);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst1, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst1, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(dst1, dst, 0, 3);
+}
+
+static void common_hz_8t_and_aver_dst_4w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  if (height == 4) {
+    common_hz_8t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_hz_8t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_hz_8t_and_aver_dst_8w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  int32_t loop_cnt = height >> 2;
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i dst0, dst1, dst2, dst3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  uint8_t *_src = (uint8_t *)src - 3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  for (; loop_cnt--;) {
+    src0 = __lsx_vld(_src, 0);
+    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(_src, src_stride3);
+    _src += src_stride4;
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, tmp0,
+                               tmp1, tmp2, tmp3);
+    dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp1, dst1, dst0, dst1);
+    __lsx_vstelm_d(dst0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst1, dst, 0, 1);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_8t_and_aver_dst_16w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  int32_t loop_cnt = height >> 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3, dst0, dst1, dst2, dst3;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 8, src2, src3);
+    src += src_stride;
+    dst0 = __lsx_vld(dst_tmp, 0);
+    dst1 = __lsx_vldx(dst_tmp, dst_stride);
+    dst_tmp += dst_stride2;
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2,
+              mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2,
+              mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2,
+              mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2,
+              mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15);
+    DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3,
+              filter0, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2,
+              tmp11, filter2, tmp8, tmp9, tmp10, tmp11);
+    DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2,
+              tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3,
+              tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6,
+              tmp7);
+    DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, dst2, dst3);
+    DUP2_ARG2(__lsx_vxori_b, dst2, 128, dst3, 128, dst2, dst3);
+    DUP2_ARG2(__lsx_vavgr_bu, dst0, dst2, dst1, dst3, dst0, dst1);
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vstx(dst1, dst, dst_stride);
+    dst += dst_stride2;
+  }
+}
+
+static void common_hz_8t_and_aver_dst_32w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height;
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3, dst0, dst1;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
+  __m128i tmp8, tmp9, tmp10, tmp11, tmp12, tmp13, tmp14, tmp15;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+    src3 = __lsx_vld(src, 24);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst0, dst1);
+    dst_tmp += dst_stride;
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask0, src1, src1, mask0, src2, src2,
+              mask0, src3, src3, mask0, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask1, src1, src1, mask1, src2, src2,
+              mask1, src3, src3, mask1, tmp4, tmp5, tmp6, tmp7);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask2, src1, src1, mask2, src2, src2,
+              mask2, src3, src3, mask2, tmp8, tmp9, tmp10, tmp11);
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask3, src1, src1, mask3, src2, src2,
+              mask3, src3, src3, mask3, tmp12, tmp13, tmp14, tmp15);
+    DUP4_ARG2(__lsx_vdp2_h_b, tmp0, filter0, tmp1, filter0, tmp2, filter0, tmp3,
+              filter0, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG2(__lsx_vdp2_h_b, tmp8, filter2, tmp9, filter2, tmp10, filter2,
+              tmp11, filter2, tmp8, tmp9, tmp10, tmp11);
+    DUP4_ARG3(__lsx_vdp2add_h_b, tmp0, tmp4, filter1, tmp1, tmp5, filter1, tmp2,
+              tmp6, filter1, tmp3, tmp7, filter1, tmp0, tmp1, tmp2, tmp3);
+    DUP4_ARG3(__lsx_vdp2add_h_b, tmp8, tmp12, filter3, tmp9, tmp13, filter3,
+              tmp10, tmp14, filter3, tmp11, tmp15, filter3, tmp4, tmp5, tmp6,
+              tmp7);
+    DUP4_ARG2(__lsx_vsadd_h, tmp0, tmp4, tmp1, tmp5, tmp2, tmp6, tmp3, tmp7,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp1, tmp0, 7, tmp3, tmp2, 7, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vavgr_bu, dst0, tmp0, dst1, tmp1, dst0, dst1);
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vst(dst1, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_8t_and_aver_dst_64w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  int32_t loop_cnt = height;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out0, out1, out2, out3, dst0, dst1;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+    src3 = __lsx_vld(src, 24);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1);
+    __lsx_vst(out0, dst, 0);
+    __lsx_vst(out1, dst, 16);
+
+    DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2);
+    src3 = __lsx_vld(src, 56);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    DUP2_ARG2(__lsx_vld, dst, 32, dst, 48, dst0, dst1);
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    DUP2_ARG2(__lsx_vavgr_bu, out0, dst0, out1, dst1, out0, out1);
+    __lsx_vst(out0, dst, 32);
+    __lsx_vst(out1, dst, 48);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
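+/* 2-tap (bilinear) average paths: each output pixel is the dot product of a
+ * pixel pair with the two filter taps, rounded by FILTER_BITS via
+ * __lsx_vssrarni_bu_h and then averaged with dst. */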
+static void common_hz_2t_and_aver_dst_4x4_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  __m128i src0, src1, src2, src3, mask;
+  __m128i dst0, dst1, dst2, dst3, vec0, vec1, filt0;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  uint8_t *dst_tmp = dst;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+  /* load the two filter taps as one halfword and broadcast to all lanes */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  DUP2_ARG2(__lsx_vilvl_w, dst1, dst0, dst3, dst2, dst0, dst1);
+  dst0 = __lsx_vilvl_d(dst1, dst0);
+  DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1);
+  DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec0, vec1);
+  vec0 = __lsx_vssrarni_bu_h(vec1, vec0, FILTER_BITS);
+  vec0 = __lsx_vavgr_bu(vec0, dst0);
+  __lsx_vstelm_w(vec0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(vec0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(vec0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(vec0, dst, 0, 3);
+}
+
+static void common_hz_2t_and_aver_dst_4x8_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt0, vec0, vec1, vec2, vec3, res0, res1, res2, res3;
+  __m128i dst0, dst1, dst2, dst3, dst4;
+  __m128i vec4, vec5, vec6, vec7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  uint8_t *src_tmp1 = (uint8_t *)src + src_stride4;
+  uint8_t *dst_tmp = dst;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+  /* load the two filter taps as one halfword and broadcast to all lanes */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+
+  src4 = __lsx_vld(src_tmp1, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src5,
+            src6);
+  src7 = __lsx_vldx(src_tmp1, src_stride3);
+
+  dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  DUP2_ARG2(__lsx_vilvl_w, dst1, dst0, dst3, dst2, dst0, dst1);
+  dst0 = __lsx_vilvl_d(dst1, dst0);
+
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst4 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  DUP2_ARG2(__lsx_vilvl_w, dst2, dst1, dst4, dst3, dst1, dst2);
+  dst1 = __lsx_vilvl_d(dst2, dst1);
+
+  DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1);
+  DUP2_ARG3(__lsx_vshuf_b, src5, src4, mask, src7, src6, mask, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec4, vec5, vec6, vec7);
+  DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5,
+            FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0,
+            res1, res2, res3);
+  DUP2_ARG2(__lsx_vilvl_d, res1, res0, res3, res2, res0, res2);
+  DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res2, dst1, res0, res2);
+
+  __lsx_vstelm_w(res0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(res0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(res0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(res0, dst, 0, 3);
+  dst += dst_stride;
+
+  __lsx_vstelm_w(res2, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(res2, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(res2, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(res2, dst, 0, 3);
+  dst += dst_stride;
+}
+
+static void common_hz_2t_and_aver_dst_4w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  if (height == 4) {
+    common_hz_2t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_hz_2t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_hz_2t_and_aver_dst_8x4_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  __m128i src0, src1, src2, src3, mask;
+  __m128i filt0, dst0, dst1, dst2, dst3;
+  __m128i vec0, vec1, vec2, vec3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  uint8_t *dst_tmp = dst;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* load the two filter taps as one halfword and broadcast to all lanes */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec0, vec1, vec2, vec3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+            FILTER_BITS, vec0, vec1);
+  dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+
+  DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+  DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec1, dst1, vec0, vec1);
+  __lsx_vstelm_d(vec0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec1, dst, 0, 1);
+}
+
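+/* 8-wide 2-tap average kernel for heights 8 and 16: the 8-row body is fully
+ * unrolled and executed a second time when height == 16. */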
+static void common_hz_2t_and_aver_dst_8x8mult_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter, int32_t height) {
+  __m128i src0, src1, src2, src3, mask;
+  __m128i filt0, dst0, dst1, dst2, dst3;
+  __m128i vec0, vec1, vec2, vec3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  uint8_t *dst_tmp = dst;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* load the two filter taps as one halfword and broadcast to all lanes */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  src += src_stride;
+
+  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec0, vec1, vec2, vec3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+            FILTER_BITS, vec0, vec2);
+  dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+
+  DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2);
+  __lsx_vstelm_d(vec0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec2, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec2, dst, 0, 1);
+  dst += dst_stride;
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  src += src_stride;
+  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec0, vec1, vec2, vec3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+            FILTER_BITS, vec0, vec2);
+  dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+  DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2);
+  __lsx_vstelm_d(vec0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec2, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(vec2, dst, 0, 1);
+  dst += dst_stride;
+
+  if (height == 16) {
+    LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+    src += src_stride;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, vec0, vec1, vec2, vec3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+              FILTER_BITS, vec0, vec2);
+    dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+    DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2);
+    __lsx_vstelm_d(vec0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(vec0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(vec2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(vec2, dst, 0, 1);
+    dst += dst_stride;
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(src, src_stride3);
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, vec0, vec1, vec2, vec3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+              FILTER_BITS, vec0, vec2);
+    dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+    DUP2_ARG2(__lsx_vavgr_bu, vec0, dst0, vec2, dst1, vec0, vec2);
+    __lsx_vstelm_d(vec0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(vec0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(vec2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(vec2, dst, 0, 1);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_and_aver_dst_8w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  if (height == 4) {
+    common_hz_2t_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else {
+    common_hz_2t_and_aver_dst_8x8mult_lsx(src, src_stride, dst, dst_stride,
+                                          filter, height);
+  }
+}
+
+static void common_hz_2t_and_aver_dst_16w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 2) - 1;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt0, dst0;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  uint8_t *src_tmp1 = (uint8_t *)src + 8;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* load the two filter taps as one halfword and broadcast to all lanes */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+  src6 = __lsx_vldx(src, src_stride3);
+  src += src_stride4;
+
+  src1 = __lsx_vld(src_tmp1, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+            src5);
+  src7 = __lsx_vldx(src_tmp1, src_stride3);
+  src_tmp1 += src_stride4;
+
+  DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+  DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+  DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+  DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            res0, res1, res2, res3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0,
+            res4, res5, res6, res7);
+  DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2,
+            FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS, res0,
+            res2, res4, res6);
+  dst0 = __lsx_vld(dst, 0);
+  res0 = __lsx_vavgr_bu(res0, dst0);
+  __lsx_vst(res0, dst, 0);
+  dst += dst_stride;
+
+  dst0 = __lsx_vld(dst, 0);
+  res2 = __lsx_vavgr_bu(res2, dst0);
+  __lsx_vst(res2, dst, 0);
+  dst += dst_stride;
+
+  dst0 = __lsx_vld(dst, 0);
+  res4 = __lsx_vavgr_bu(res4, dst0);
+  __lsx_vst(res4, dst, 0);
+  dst += dst_stride;
+
+  dst0 = __lsx_vld(dst, 0);
+  res6 = __lsx_vavgr_bu(res6, dst0);
+  __lsx_vst(res6, dst, 0);
+  dst += dst_stride;
+
+  for (; loop_cnt--;) {
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+    src6 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    src1 = __lsx_vld(src_tmp1, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+              src5);
+    src7 = __lsx_vldx(src_tmp1, src_stride3);
+    src_tmp1 += src_stride4;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, res0, res1, res2, res3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, res4, res5, res6, res7);
+
+    DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2,
+              FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS,
+              res0, res2, res4, res6);
+    dst0 = __lsx_vld(dst, 0);
+    res0 = __lsx_vavgr_bu(res0, dst0);
+    __lsx_vst(res0, dst, 0);
+    dst += dst_stride;
+
+    dst0 = __lsx_vld(dst, 0);
+    res2 = __lsx_vavgr_bu(res2, dst0);
+    __lsx_vst(res2, dst, 0);
+    dst += dst_stride;
+
+    dst0 = __lsx_vld(dst, 0);
+    res4 = __lsx_vavgr_bu(res4, dst0);
+    __lsx_vst(res4, dst, 0);
+    dst += dst_stride;
+
+    dst0 = __lsx_vld(dst, 0);
+    res6 = __lsx_vavgr_bu(res6, dst0);
+    __lsx_vst(res6, dst, 0);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_and_aver_dst_32w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 1);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt0, dst0, dst1;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i res0, res1, res2, res3, res4, res5, res6, res7;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* load the two filter taps as one halfword and broadcast to all lanes */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  for (; loop_cnt--;) {
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vld, src, 16, src, 24, src2, src3);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    src += src_stride;
+    src4 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vld, src, 16, src, 24, src6, src7);
+    src5 = __lsx_vshuf_b(src6, src4, shuff);
+    src += src_stride;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, res0, res1, res2, res3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, res4, res5, res6, res7);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, res1, res0, FILTER_BITS, res3, res2,
+              FILTER_BITS, res5, res4, FILTER_BITS, res7, res6, FILTER_BITS,
+              res0, res2, res4, res6);
+
+    DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
+    res0 = __lsx_vavgr_bu(res0, dst0);
+    __lsx_vst(res0, dst, 0);
+    res2 = __lsx_vavgr_bu(res2, dst1);
+    __lsx_vst(res2, dst, 16);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vld, dst, 0, dst, 16, dst0, dst1);
+    res4 = __lsx_vavgr_bu(res4, dst0);
+    __lsx_vst(res4, dst, 0);
+    res6 = __lsx_vavgr_bu(res6, dst1);
+    __lsx_vst(res6, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_and_aver_dst_64w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt0, dst0, dst1, dst2, dst3;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* broadcast the 2-tap filter coefficient across the vector */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  for (; loop_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src2, src4,
+              src6);
+    src7 = __lsx_vld(src, 56);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src0, shuff, src4, src2, shuff, src1, src3);
+    src5 = __lsx_vshuf_b(src6, src4, shuff);
+    src += src_stride;
+
+    DUP2_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, vec0, vec1);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src2, mask, src3, src3, mask, vec2, vec3);
+    DUP2_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, vec4, vec5);
+    DUP2_ARG3(__lsx_vshuf_b, src6, src6, mask, src7, src7, mask, vec6, vec7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, out0, out1, out2, out3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, out4, out5, out6, out7);
+
+    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+              out0, out2, out4, out6);
+
+    DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, dst0, dst1, dst2,
+              dst3);
+    out0 = __lsx_vavgr_bu(out0, dst0);
+    __lsx_vst(out0, dst, 0);
+    out2 = __lsx_vavgr_bu(out2, dst1);
+    __lsx_vst(out2, dst, 16);
+    out4 = __lsx_vavgr_bu(out4, dst2);
+    __lsx_vst(out4, dst, 32);
+    out6 = __lsx_vavgr_bu(out6, dst3);
+    __lsx_vst(out6, dst, 48);
+    dst += dst_stride;
+  }
+}
+
+void vpx_convolve8_avg_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                                 uint8_t *dst, ptrdiff_t dst_stride,
+                                 const InterpKernel *filter, int x0_q4,
+                                 int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                 int h) {
+  const int16_t *const filter_x = filter[x0_q4];
+  int8_t cnt, filt_hor[8];
+
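+  /* Unscaled convolution only.  On little-endian LoongArch the word checked
+     below is filter_x[2] | (filter_x[3] << 16), so the assert rejects the
+     sub-pel 0 (copy) kernel whose only non-zero tap is filter_x[3] == 128. */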
+  assert(x_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_hor[cnt] = filter_x[cnt];
+  }
+
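+  /* A bilinear kernel keeps only taps 3 and 4, so the 2-tap paths below
+     are handed &filt_hor[3]. */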
+  if (vpx_get_filter_taps(filter_x) == 2) {
+    switch (w) {
+      case 4:
+        common_hz_2t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, &filt_hor[3], h);
+        break;
+      case 8:
+        common_hz_2t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, &filt_hor[3], h);
+        break;
+      case 16:
+        common_hz_2t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, &filt_hor[3], h);
+        break;
+      case 32:
+        common_hz_2t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, &filt_hor[3], h);
+        break;
+      case 64:
+        common_hz_2t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, &filt_hor[3], h);
+        break;
+      default:
+        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  } else {
+    switch (w) {
+      case 4:
+        common_hz_8t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, filt_hor, h);
+        break;
+      case 8:
+        common_hz_8t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, filt_hor, h);
+        break;
+      case 16:
+        common_hz_8t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, filt_hor, h);
+        break;
+      case 32:
+        common_hz_8t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, filt_hor, h);
+        break;
+      case 64:
+        common_hz_8t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, filt_hor, h);
+        break;
+      default:
+        vpx_convolve8_avg_horiz_c(src, src_stride, dst, dst_stride, filter,
+                                  x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  }
+}
diff --git a/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c b/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c
new file mode 100644
index 0000000..d1abf62
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_lsx.c
@@ -0,0 +1,737 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+  /* 8 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  /* 4 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+  /* 4 width cases, shifted by 8 */
+  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
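+/* These are vshuf_b index tables that gather the overlapping byte pairs fed
+   to each dot-product tap; in the 4-width rows, entries of 16 and above are
+   assumed to select bytes from the second source vector. */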
+
+static void common_hv_8ht_8vt_and_aver_dst_4w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  __m128i out0, out1;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  uint8_t *_src = (uint8_t *)src - 3 - src_stride3;
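+  /* Back up 3 columns and 3 rows so the 8x8-tap window is centered on the
+     first output pixel. */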
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
+            filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+
+  src0 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(_src, src_stride3);
+  _src += src_stride4;
+  src4 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
+  _src += src_stride3;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
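+  /* xor with 128 maps unsigned pixels into signed range for the signed
+     multiply-add helpers; the bias is removed by another xor after the
+     final rounding shift. */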
+
+  tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
+  DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
+            filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+  DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+  tmp2 = __lsx_vpackev_b(tmp5, tmp4);
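+  /* tmp0..tmp2 hold the packed vertical-filter history; they slide down as
+     new rows are produced at the bottom of the loop. */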
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(_src, 0);
+    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
+    src10 = __lsx_vldx(_src, src_stride3);
+    _src += src_stride4;
+    src2 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src3 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src4 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src5 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_w, src3, src2, src5, src4, src2, src3);
+    src2 = __lsx_vilvl_d(src3, src2);
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
+    tmp4 = __lsx_vpackev_b(tmp3, tmp4);
+    out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    src0 = __lsx_vshuf_b(src1, tmp3, shuff);
+    src0 = __lsx_vpackev_b(src1, src0);
+    out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    out0 = __lsx_vssrarni_b_h(out1, out0, FILTER_BITS);
+    out0 = __lsx_vxori_b(out0, 128);
+    out0 = __lsx_vavgr_bu(out0, src2);
+    __lsx_vstelm_w(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 3);
+    dst += dst_stride;
+
+    tmp5 = src1;
+    tmp0 = tmp2;
+    tmp1 = tmp4;
+    tmp2 = src0;
+  }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_8w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+  __m128i out0, out1;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  uint8_t *_src = (uint8_t *)src - 3 - src_stride3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
+            filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+
+  src0 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(_src, src_stride3);
+  _src += src_stride4;
+  src4 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
+  _src += src_stride3;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
+
+  src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
+            filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+  DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4, src2, src1,
+            tmp0, tmp1, tmp2, tmp4);
+  DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6);
+
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(_src, 0);
+    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
+    src10 = __lsx_vldx(_src, src_stride3);
+    _src += src_stride4;
+
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    tmp3 = __lsx_vpackev_b(src7, src6);
+    out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    src0 = __lsx_vpackev_b(src8, src7);
+    out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    src1 = __lsx_vpackev_b(src9, src8);
+    src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+    src2 = __lsx_vpackev_b(src10, src9);
+    src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, FILTER_BITS, src4, src3,
+              FILTER_BITS, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    src5 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src7 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src8 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src9 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, src7, src5, src9, src8, src5, src7);
+    DUP2_ARG2(__lsx_vavgr_bu, out0, src5, out1, src7, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 1);
+    dst += dst_stride;
+
+    src6 = src10;
+    tmp0 = tmp2;
+    tmp1 = tmp3;
+    tmp2 = src1;
+    tmp4 = tmp6;
+    tmp5 = src0;
+    tmp6 = src2;
+  }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_16w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+  common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
+                                        filter_horiz, filter_vert, height);
+  src += 8;
+  dst += 8;
+
+  common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
+                                        filter_horiz, filter_vert, height);
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_32w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+  int32_t multiple8_cnt;
+
+  for (multiple8_cnt = 4; multiple8_cnt--;) {
+    common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
+                                          filter_horiz, filter_vert, height);
+    src += 8;
+    dst += 8;
+  }
+}
+
+static void common_hv_8ht_8vt_and_aver_dst_64w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    const int8_t *filter_horiz, const int8_t *filter_vert, int32_t height) {
+  int32_t multiple8_cnt;
+
+  for (multiple8_cnt = 8; multiple8_cnt--;) {
+    common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, src_stride, dst, dst_stride,
+                                          filter_horiz, filter_vert, height);
+    src += 8;
+    dst += 8;
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4x4_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert) {
+  __m128i src0, src1, src2, src3, src4, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1;
+  __m128i dst0, dst1, dst2, dst3;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+  /* broadcast the 2-tap horizontal and vertical filter coefficients */
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
+  hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+  hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+  hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
+  hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2);
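+  /* Pair each horizontally filtered row with the next one byte-wise so a
+     single dot-product applies the vertical 2-tap filter. */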
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+
+  dst0 = __lsx_vldrepl_w(dst, 0);
+  dst1 = __lsx_vldrepl_w(dst + dst_stride, 0);
+  dst2 = __lsx_vldrepl_w(dst + dst_stride2, 0);
+  dst3 = __lsx_vldrepl_w(dst + dst_stride3, 0);
+  dst0 = __lsx_vilvl_w(dst1, dst0);
+  dst1 = __lsx_vilvl_w(dst3, dst2);
+  dst0 = __lsx_vilvl_d(dst1, dst0);
+  DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+  tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+  tmp0 = __lsx_vavgr_bu(tmp0, dst0);
+  __lsx_vstelm_w(tmp0, dst, 0, 0);
+  __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2);
+  __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4x8_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert) {
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3, res0, res1;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  __m128i hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3;
+  __m128i dst0, dst1, dst2, dst3, dst4;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+  /* broadcast the 2-tap filter coefficients */
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  src += src_stride4;
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src5, src6, src7, src8);
+  src += src_stride4;
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
+  hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+  hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz);
+  hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz);
+  hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz);
+  DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff,
+            hz_out1, hz_out3);
+  hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff);
+  hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6);
+
+  dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst0 = __lsx_vilvl_w(dst1, dst0);
+  dst1 = __lsx_vilvl_w(dst3, dst2);
+  dst0 = __lsx_vilvl_d(dst1, dst0);
+
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst4 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst1 = __lsx_vilvl_w(dst2, dst1);
+  dst2 = __lsx_vilvl_w(dst4, dst3);
+  dst1 = __lsx_vilvl_d(dst2, dst1);
+
+  DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out5,
+            hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3,
+            filt_vt, tmp0, tmp1, tmp2, tmp3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+            FILTER_BITS, res0, res1);
+  DUP2_ARG2(__lsx_vavgr_bu, res0, dst0, res1, dst1, res0, res1);
+
+  __lsx_vstelm_w(res0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(res0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(res0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(res0, dst, 0, 3);
+  dst += dst_stride;
+
+  __lsx_vstelm_w(res1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(res1, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(res1, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(res1, dst, 0, 3);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_4w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  if (height == 4) {
+    common_hv_2ht_2vt_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert);
+  } else if (height == 8) {
+    common_hv_2ht_2vt_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert);
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8x4_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert) {
+  __m128i src0, src1, src2, src3, src4, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+  __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+  __m128i dst0, dst1, dst2, dst3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  uint8_t *dst_tmp = dst;
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+  /* broadcast the 2-tap filter coefficients */
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+
+  dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+  DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+  hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+  vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+  tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+  hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+  vec1 = __lsx_vpackev_b(hz_out0, hz_out1);
+  tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt);
+
+  hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+  vec2 = __lsx_vpackev_b(hz_out1, hz_out0);
+  tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt);
+
+  hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+  vec3 = __lsx_vpackev_b(hz_out0, hz_out1);
+  tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+            FILTER_BITS, tmp0, tmp1);
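+  /* AVG_ST4_D (from the shared LSX convolve header) is assumed to average
+     tmp0/tmp1 with the packed dst rows and store four 8-byte output rows. */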
+  AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, mask;
+  __m128i filt_hz, filt_vt, vec0;
+  __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+  __m128i dst0, dst1, dst2, dst3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  uint8_t *dst_tmp = dst;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* broadcast the 2-tap filter coefficients */
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, tmp0, tmp1);
+
+    dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+    AVG_ST4_D(tmp0, tmp1, dst0, dst1, dst, dst_stride);
+    dst += dst_stride;
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_8w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  if (height == 4) {
+    common_hv_2ht_2vt_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert);
+  } else {
+    common_hv_2ht_2vt_and_aver_dst_8x8mult_lsx(
+        src, src_stride, dst, dst_stride, filter_horiz, filter_vert, height);
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_16w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  uint8_t *src_tmp1;
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1, dst0, dst1, dst2, dst3;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, tmp0, tmp1, tmp3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride << 2;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+  /* broadcast the 2-tap filter coefficients */
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+  src += src_stride;
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+  hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+
+  for (; loop_cnt--;) {
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+    src6 = __lsx_vldx(src, src_stride3);
+    src_tmp1 = (uint8_t *)(src + 8);
+    src1 = __lsx_vld(src_tmp1, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+              src5);
+    src7 = __lsx_vldx(src_tmp1, src_stride3);
+    src += src_stride4;
+    dst0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+    dst3 = __lsx_vldx(dst, dst_stride3);
+
+    hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+    hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp3 = __lsx_vavgr_bu(tmp3, dst0);
+    __lsx_vst(tmp3, dst, 0);
+
+    hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+    hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp3 = __lsx_vavgr_bu(tmp3, dst1);
+    __lsx_vstx(tmp3, dst, dst_stride);
+
+    hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+    hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp3 = __lsx_vavgr_bu(tmp3, dst2);
+    __lsx_vstx(tmp3, dst, dst_stride2);
+
+    hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz);
+    hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+    tmp3 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp3 = __lsx_vavgr_bu(tmp3, dst3);
+    __lsx_vstx(tmp3, dst, dst_stride3);
+    dst += dst_stride4;
+  }
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_32w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride,
+                                         filter_horiz, filter_vert, height);
+  src += 16;
+  dst += 16;
+
+  common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride,
+                                         filter_horiz, filter_vert, height);
+}
+
+static void common_hv_2ht_2vt_and_aver_dst_64w_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter_horiz, int8_t *filter_vert, int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 4; multiple8_cnt--;) {
+    common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, src_stride, dst, dst_stride,
+                                           filter_horiz, filter_vert, height);
+    src += 16;
+    dst += 16;
+  }
+}
+
+void vpx_convolve8_avg_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel *filter, int x0_q4, int x_step_q4,
+                           int y0_q4, int y_step_q4, int w, int h) {
+  const int16_t *const filter_x = filter[x0_q4];
+  const int16_t *const filter_y = filter[y0_q4];
+  int8_t cnt, filt_hor[8], filt_ver[8];
+
+  assert(x_step_q4 == 16);
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_hor[cnt] = filter_x[cnt];
+    filt_ver[cnt] = filter_y[cnt];
+  }
+  if (vpx_get_filter_taps(filter_x) == 2 &&
+      vpx_get_filter_taps(filter_y) == 2) {
+    switch (w) {
+      case 4:
+        common_hv_2ht_2vt_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+                                              (int32_t)dst_stride, &filt_hor[3],
+                                              &filt_ver[3], h);
+        break;
+      case 8:
+        common_hv_2ht_2vt_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+                                              (int32_t)dst_stride, &filt_hor[3],
+                                              &filt_ver[3], h);
+        break;
+      case 16:
+        common_hv_2ht_2vt_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride,
+                                               &filt_hor[3], &filt_ver[3], h);
+        break;
+      case 32:
+        common_hv_2ht_2vt_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride,
+                                               &filt_hor[3], &filt_ver[3], h);
+        break;
+      case 64:
+        common_hv_2ht_2vt_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride,
+                                               &filt_hor[3], &filt_ver[3], h);
+        break;
+      default:
+        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                            x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  } else if (vpx_get_filter_taps(filter_x) == 2 ||
+             vpx_get_filter_taps(filter_y) == 2) {
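+    /* Mixed 2-tap/8-tap combinations have no LSX fast path; fall back to
+       the C reference. */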
+    vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                        x_step_q4, y0_q4, y_step_q4, w, h);
+  } else {
+    switch (w) {
+      case 4:
+        common_hv_8ht_8vt_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+                                              (int32_t)dst_stride, filt_hor,
+                                              filt_ver, h);
+        break;
+      case 8:
+        common_hv_8ht_8vt_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+                                              (int32_t)dst_stride, filt_hor,
+                                              filt_ver, h);
+        break;
+      case 16:
+        common_hv_8ht_8vt_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride, filt_hor,
+                                               filt_ver, h);
+        break;
+      case 32:
+        common_hv_8ht_8vt_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride, filt_hor,
+                                               filt_ver, h);
+        break;
+      case 64:
+        common_hv_8ht_8vt_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+                                               (int32_t)dst_stride, filt_hor,
+                                               filt_ver, h);
+        break;
+      default:
+        vpx_convolve8_avg_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                            x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  }
+}
diff --git a/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c b/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c
new file mode 100644
index 0000000..5c6413d
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/vpx_convolve8_avg_vert_lsx.c
@@ -0,0 +1,918 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static void common_vt_8t_and_aver_dst_4w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  __m128i reg0, reg1, reg2, reg3, reg4;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i out0, out1;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  uint8_t *src_tmp0 = (uint8_t *)src - src_stride3;
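+  /* The vertical 8-tap needs three rows of context above the first output
+     row. */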
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  src0 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src1,
+            src2);
+  src3 = __lsx_vldx(src_tmp0, src_stride3);
+  src_tmp0 += src_stride4;
+  src4 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src5,
+            src6);
+  src_tmp0 += src_stride3;
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0,
+            tmp1, tmp2, tmp3);
+  DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5);
+  DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1);
+  reg2 = __lsx_vilvl_d(tmp5, tmp2);
+  DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1);
+  reg2 = __lsx_vxori_b(reg2, 128);
+
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(src_tmp0, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src8,
+              src9);
+    src10 = __lsx_vldx(src_tmp0, src_stride3);
+    src_tmp0 += src_stride4;
+    src0 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src1 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src2 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src3 = __lsx_vldrepl_w(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_w, src1, src0, src3, src2, src0, src1);
+    src0 = __lsx_vilvl_d(src1, src0);
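+    /* Four 4-byte destination rows packed into one vector: a single
+       vavgr_bu below averages all four rows at once. */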
+    DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
+    DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
+    out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1,
+                               filter2, filter3);
+    out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1,
+                               filter2, filter3);
+    out0 = __lsx_vssrarni_b_h(out1, out0, FILTER_BITS);
+    out0 = __lsx_vxori_b(out0, 128);
+    out0 = __lsx_vavgr_bu(out0, src0);
+    __lsx_vstelm_w(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 3);
+    dst += dst_stride;
+    reg0 = reg2;
+    reg1 = reg3;
+    reg2 = reg4;
+    src6 = src10;
+  }
+}
+
+static void common_vt_8t_and_aver_dst_8w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride,
+                                             const int8_t *filter,
+                                             int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  uint8_t *dst_tmp = dst;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i out0, out1, out2, out3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  uint8_t *src_tmp0 = (uint8_t *)src - src_stride3;
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  src0 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src1,
+            src2);
+  src3 = __lsx_vldx(src_tmp0, src_stride3);
+  src_tmp0 += src_stride4;
+  src4 = __lsx_vld(src_tmp0, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src5,
+            src6);
+  src_tmp0 += src_stride3;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0,
+            reg1, reg2, reg3);
+  DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(src_tmp0, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp0, src_stride, src_tmp0, src_stride2, src8,
+              src9);
+    src10 = __lsx_vldx(src_tmp0, src_stride3);
+    src_tmp0 += src_stride4;
+    src0 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src1 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src2 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    src3 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, src1, src0, src3, src2, src0, src1);
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+              tmp0, tmp1, tmp2, tmp3);
+    out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1,
+                               filter2, filter3);
+    out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1,
+                               filter2, filter3);
+    out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1,
+                               filter2, filter3);
+    out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1,
+                               filter2, filter3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, FILTER_BITS, out3, out2,
+              FILTER_BITS, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    DUP2_ARG2(__lsx_vavgr_bu, out0, src0, out1, src1, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 1);
+    dst += dst_stride;
+    reg0 = reg2;
+    reg1 = tmp0;
+    reg2 = tmp2;
+    reg3 = reg5;
+    reg4 = tmp1;
+    reg5 = tmp3;
+    src6 = src10;
+  }
+}
+
+static void common_vt_8t_and_aver_dst_16w_mult_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    const int8_t *filter, int32_t height, int32_t width) {
+  uint8_t *src_tmp;
+  uint32_t cnt = width >> 4;
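+  /* Process the block as (width / 16) vertical stripes, each 16 pixels wide
+     and running the full height. */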
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+  __m128i reg6, reg7, reg8, reg9, reg10, reg11;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+  uint8_t *src_tmp0 = (uint8_t *)src - src_stride3;
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  for (; cnt--;) {
+    uint32_t loop_cnt = height >> 2;
+    uint8_t *dst_reg = dst;
+
+    src_tmp = src_tmp0;
+    src0 = __lsx_vld(src_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src1,
+              src2);
+    src3 = __lsx_vldx(src_tmp, src_stride3);
+    src_tmp += src_stride4;
+    src4 = __lsx_vld(src_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+              src6);
+    src_tmp += src_stride3;
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+    src6 = __lsx_vxori_b(src6, 128);
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+              reg0, reg1, reg2, reg3);
+    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+    DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
+              reg6, reg7, reg8, reg9);
+    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);
+    for (; loop_cnt--;) {
+      src7 = __lsx_vld(src_tmp, 0);
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src8,
+                src9);
+      src10 = __lsx_vldx(src_tmp, src_stride3);
+      src_tmp += src_stride4;
+      DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
+                src7, src8, src9, src10);
+      DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+                src0, src1, src2, src3);
+      DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
+                src4, src5, src7, src8);
+      tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1,
+                                 filter2, filter3);
+      tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1,
+                                 filter2, filter3);
+      tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1,
+                                 filter2, filter3);
+      tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1,
+                                 filter2, filter3);
+      DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, FILTER_BITS, tmp3, tmp1,
+                FILTER_BITS, tmp0, tmp1);
+      DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+      tmp2 = __lsx_vld(dst_reg, 0);
+      tmp3 = __lsx_vldx(dst_reg, dst_stride);
+      DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
+      __lsx_vst(tmp0, dst_reg, 0);
+      __lsx_vstx(tmp1, dst_reg, dst_stride);
+      tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1,
+                                 filter2, filter3);
+      tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1,
+                                 filter2, filter3);
+      tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1,
+                                 filter2, filter3);
+      tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1,
+                                 filter2, filter3);
+      DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, FILTER_BITS, tmp3, tmp1,
+                FILTER_BITS, tmp0, tmp1);
+      DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+      tmp2 = __lsx_vldx(dst_reg, dst_stride2);
+      tmp3 = __lsx_vldx(dst_reg, dst_stride3);
+      DUP2_ARG2(__lsx_vavgr_bu, tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
+      __lsx_vstx(tmp0, dst_reg, dst_stride2);
+      __lsx_vstx(tmp1, dst_reg, dst_stride3);
+      dst_reg += dst_stride4;
+
+      reg0 = reg2;
+      reg1 = src0;
+      reg2 = src2;
+      reg3 = reg5;
+      reg4 = src1;
+      reg5 = src3;
+      reg6 = reg8;
+      reg7 = src4;
+      reg8 = src7;
+      reg9 = reg11;
+      reg10 = src5;
+      reg11 = src8;
+      src6 = src10;
+    }
+    src_tmp0 += 16;
+    dst += 16;
+  }
+}
+
+static void common_vt_8t_and_aver_dst_16w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height) {
+  common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
+                                         filter, height, 16);
+}
+
+static void common_vt_8t_and_aver_dst_32w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height) {
+  common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
+                                         filter, height, 32);
+}
+
+static void common_vt_8t_and_aver_dst_64w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              const int8_t *filter,
+                                              int32_t height) {
+  common_vt_8t_and_aver_dst_16w_mult_lsx(src, src_stride, dst, dst_stride,
+                                         filter, height, 64);
+}
+
+static void common_vt_2t_and_aver_dst_4x4_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  __m128i src0, src1, src2, src3, src4;
+  __m128i dst0, dst1, dst2, dst3, out, filt0, src2110, src4332;
+  __m128i src10_r, src32_r, src21_r, src43_r;
+  __m128i tmp0, tmp1;
+  uint8_t *dst_tmp = dst;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  src += src_stride4;
+  src4 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst0 = __lsx_vilvl_w(dst1, dst0);
+  dst1 = __lsx_vilvl_w(dst3, dst2);
+  dst0 = __lsx_vilvl_d(dst1, dst0);
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+            src10_r, src21_r, src32_r, src43_r);
+  DUP2_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src2110,
+            src4332);
+  DUP2_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, tmp0, tmp1);
+  tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+  out = __lsx_vavgr_bu(tmp0, dst0);
+  __lsx_vstelm_w(out, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out, dst, 0, 3);
+  dst += dst_stride;
+}
+
+static void common_vt_2t_and_aver_dst_4x8_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  __m128i dst0, dst1, dst2, dst3, dst4;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src87_r;
+  __m128i src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
+  __m128i src2110, src4332, src6554, src8776, filt0;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  uint8_t *dst_tmp = dst;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  src += src_stride4;
+  src4 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+  src7 = __lsx_vldx(src, src_stride3);
+  src += src_stride4;
+  src8 = __lsx_vld(src, 0);
+
+  dst0 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst0 = __lsx_vilvl_w(dst1, dst0);
+  dst1 = __lsx_vilvl_w(dst3, dst2);
+  dst0 = __lsx_vilvl_d(dst1, dst0);
+
+  dst1 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst4 = __lsx_vldrepl_w(dst_tmp, 0);
+  dst1 = __lsx_vilvl_w(dst2, dst1);
+  dst2 = __lsx_vilvl_w(dst4, dst3);
+  dst1 = __lsx_vilvl_d(dst2, dst1);
+
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+            src10_r, src21_r, src32_r, src43_r);
+  DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+            src54_r, src65_r, src76_r, src87_r);
+  DUP4_ARG2(__lsx_vilvl_d, src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
+            src87_r, src76_r, src2110, src4332, src6554, src8776);
+  DUP4_ARG2(__lsx_vdp2_h_bu, src2110, filt0, src4332, filt0, src6554, filt0,
+            src8776, filt0, tmp0, tmp1, tmp2, tmp3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+            FILTER_BITS, tmp0, tmp2);
+  DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2);
+  __lsx_vstelm_w(tmp0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(tmp0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(tmp0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(tmp0, dst, 0, 3);
+  dst += dst_stride;
+
+  __lsx_vstelm_w(tmp2, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(tmp2, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(tmp2, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(tmp2, dst, 0, 3);
+}
+
+static void common_vt_2t_and_aver_dst_4w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  if (height == 4) {
+    common_vt_2t_and_aver_dst_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_vt_2t_and_aver_dst_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_vt_2t_and_aver_dst_8x4_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter) {
+  __m128i src0, src1, src2, src3, src4;
+  __m128i dst0, dst1, dst2, dst3, vec0, vec1, vec2, vec3, filt0;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  uint8_t *dst_tmp = dst;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+  dst_tmp += dst_stride;
+  DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+  DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec1);
+  DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            tmp0, tmp1, tmp2, tmp3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+            FILTER_BITS, tmp0, tmp2);
+  DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2);
+  __lsx_vstelm_d(tmp0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(tmp0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_d(tmp2, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(tmp2, dst, 0, 1);
+}
+
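+/* Vertical 2-tap filter with dst averaging, 8 pixels wide, height a multiple
+   of 8: eight rows are produced per loop iteration. */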
+static void common_vt_2t_and_aver_dst_8x8mult_lsx(
+    const uint8_t *src, int32_t src_stride, uint8_t *dst, int32_t dst_stride,
+    int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 3);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  uint8_t *dst_tmp = dst;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    src5 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src6, src7);
+    src8 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    dst0 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst1 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, dst1, dst0, dst3, dst2, dst0, dst1);
+
+    dst2 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst3 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst4 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    dst5 = __lsx_vldrepl_d(dst_tmp, 0);
+    dst_tmp += dst_stride;
+    DUP2_ARG2(__lsx_vilvl_d, dst3, dst2, dst5, dst4, dst2, dst3);
+
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+              vec0, vec1, vec2, vec3);
+    DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+              vec4, vec5, vec6, vec7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, tmp0, tmp1, tmp2, tmp3);
+
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, tmp0, tmp2);
+    DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst0, tmp2, dst1, tmp0, tmp2);
+    __lsx_vstelm_d(tmp0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 1);
+    dst += dst_stride;
+
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, tmp0, tmp2);
+    DUP2_ARG2(__lsx_vavgr_bu, tmp0, dst2, tmp2, dst3, tmp0, tmp2);
+    __lsx_vstelm_d(tmp0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 1);
+    dst += dst_stride;
+
+    src0 = src8;
+  }
+}
+
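+/* Width-8 dispatcher: height 4 uses the fixed kernel, all other heights the
+   multiple-of-8 loop. */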
+static void common_vt_2t_and_aver_dst_8w_lsx(const uint8_t *src,
+                                             int32_t src_stride, uint8_t *dst,
+                                             int32_t dst_stride, int8_t *filter,
+                                             int32_t height) {
+  if (height == 4) {
+    common_vt_2t_and_aver_dst_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else {
+    common_vt_2t_and_aver_dst_8x8mult_lsx(src, src_stride, dst, dst_stride,
+                                          filter, height);
+  }
+}
+
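+/* Vertical 2-tap filter with dst averaging, 16 pixels wide: four rows per
+   iteration, using low/high byte interleaves of adjacent source rows. */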
+static void common_vt_2t_and_aver_dst_16w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, dst0, dst1, dst2, dst3, filt0;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i tmp0, tmp1;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    dst0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+    dst3 = __lsx_vldx(dst, dst_stride3);
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst0);
+    __lsx_vst(tmp0, dst, 0);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst1);
+    __lsx_vst(tmp0, dst, 0);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst2);
+    __lsx_vst(tmp0, dst, 0);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst3);
+    __lsx_vst(tmp0, dst, 0);
+    dst += dst_stride;
+
+    src0 = src4;
+  }
+}
+
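+/* 32-wide variant: each row is processed as two 16-byte columns, four rows
+   per iteration. */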
+static void common_vt_2t_and_aver_dst_32w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  uint8_t *src_tmp1;
+  uint8_t *dst_tmp1;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  __m128i tmp0, tmp1;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src5);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+
+    dst0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+    dst3 = __lsx_vldx(dst, dst_stride3);
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+
+    src_tmp1 = (uint8_t *)src + 16;
+    src6 = __lsx_vld(src_tmp1, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src7,
+              src8);
+    src9 = __lsx_vldx(src_tmp1, src_stride3);
+
+    dst_tmp1 = dst + 16;
+    dst4 = __lsx_vld(dst_tmp1, 0);
+    DUP2_ARG2(__lsx_vldx, dst_tmp1, dst_stride, dst_tmp1, dst_stride2, dst5,
+              dst6);
+    dst7 = __lsx_vldx(dst_tmp1, dst_stride3);
+    src += src_stride4;
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst0);
+    __lsx_vst(tmp0, dst, 0);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst1);
+    __lsx_vstx(tmp0, dst, dst_stride);
+
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst2);
+    __lsx_vstx(tmp0, dst, dst_stride2);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst3);
+    __lsx_vstx(tmp0, dst, dst_stride3);
+
+    DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst4);
+    __lsx_vst(tmp0, dst, 16);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst5);
+    dst += dst_stride;
+    __lsx_vst(tmp0, dst, 16);
+
+    DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst6);
+    dst += dst_stride;
+    __lsx_vst(tmp0, dst, 16);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst7);
+    dst += dst_stride;
+    __lsx_vst(tmp0, dst, 16);
+    dst += dst_stride;
+
+    src0 = src4;
+    src5 = src9;
+  }
+}
+
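+/* 64-wide variant: each row is processed as four 16-byte columns, two rows
+   per iteration. */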
+static void common_vt_2t_and_aver_dst_64w_lsx(const uint8_t *src,
+                                              int32_t src_stride, uint8_t *dst,
+                                              int32_t dst_stride,
+                                              int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 1);
+  int32_t src_stride2 = src_stride << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  uint8_t *src_tmp1;
+  uint8_t *dst_tmp1;
+  __m128i src0, src1, src2, src3, src4, src5;
+  __m128i src6, src7, src8, src9, src10, src11, filt0;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i tmp0, tmp1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src3, src6,
+            src9);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src2 = __lsx_vldx(src, src_stride);
+    dst1 = __lsx_vldx(dst, dst_stride);
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src1, src4, src7,
+              src10);
+    DUP4_ARG2(__lsx_vld, dst, 0, dst, 16, dst, 32, dst, 48, dst0, dst2, dst4,
+              dst6);
+    src_tmp1 = (uint8_t *)src + 16;
+    src5 = __lsx_vldx(src_tmp1, src_stride);
+    src_tmp1 = src_tmp1 + 16;
+    src8 = __lsx_vldx(src_tmp1, src_stride);
+    src_tmp1 = src_tmp1 + 16;
+    src11 = __lsx_vldx(src_tmp1, src_stride);
+
+    dst_tmp1 = dst + 16;
+    dst3 = __lsx_vldx(dst_tmp1, dst_stride);
+    dst_tmp1 = dst + 32;
+    dst5 = __lsx_vldx(dst_tmp1, dst_stride);
+    dst_tmp1 = dst + 48;
+    dst7 = __lsx_vldx(dst_tmp1, dst_stride);
+    src += src_stride2;
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst0);
+    __lsx_vst(tmp0, dst, 0);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst1);
+    __lsx_vstx(tmp0, dst, dst_stride);
+
+    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst2);
+    __lsx_vst(tmp0, dst, 16);
+
+    dst_tmp1 = dst + 16;
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst3);
+    __lsx_vstx(tmp0, dst_tmp1, dst_stride);
+
+    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst4);
+    __lsx_vst(tmp0, dst, 32);
+
+    dst_tmp1 = dst_tmp1 + 16;
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst5);
+    __lsx_vstx(tmp0, dst_tmp1, dst_stride);
+
+    DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst6);
+    __lsx_vst(tmp0, dst, 48);
+
+    dst_tmp1 = dst_tmp1 + 16;
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    tmp0 = __lsx_vavgr_bu(tmp0, dst7);
+    __lsx_vstx(tmp0, dst_tmp1, dst_stride);
+    dst += dst_stride2;
+
+    src0 = src2;
+    src3 = src5;
+    src6 = src8;
+    src9 = src11;
+  }
+}
+
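+/* Entry point for vertical convolve-and-average: copy the selected kernel,
+   then pick a 2-tap or 8-tap LSX path by block width, falling back to the C
+   implementation for unsupported widths. */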
+void vpx_convolve8_avg_vert_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                                uint8_t *dst, ptrdiff_t dst_stride,
+                                const InterpKernel *filter, int x0_q4,
+                                int x_step_q4, int y0_q4, int y_step_q4, int w,
+                                int h) {
+  const int16_t *const filter_y = filter[y0_q4];
+  int8_t cnt, filt_ver[8];
+
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_ver[cnt] = filter_y[cnt];
+  }
+
+  if (vpx_get_filter_taps(filter_y) == 2) {
+    switch (w) {
+      case 4:
+        common_vt_2t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, &filt_ver[3], h);
+        break;
+      case 8:
+        common_vt_2t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, &filt_ver[3], h);
+        break;
+      case 16:
+        common_vt_2t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, &filt_ver[3], h);
+        break;
+      case 32:
+        common_vt_2t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, &filt_ver[3], h);
+        break;
+      case 64:
+        common_vt_2t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, &filt_ver[3], h);
+        break;
+      default:
+        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  } else {
+    switch (w) {
+      case 4:
+        common_vt_8t_and_aver_dst_4w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, filt_ver, h);
+        break;
+      case 8:
+        common_vt_8t_and_aver_dst_8w_lsx(src, (int32_t)src_stride, dst,
+                                         (int32_t)dst_stride, filt_ver, h);
+        break;
+      case 16:
+        common_vt_8t_and_aver_dst_16w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, filt_ver, h);
+        break;
+      case 32:
+        common_vt_8t_and_aver_dst_32w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, filt_ver, h);
+        break;
+      case 64:
+        common_vt_8t_and_aver_dst_64w_lsx(src, (int32_t)src_stride, dst,
+                                          (int32_t)dst_stride, filt_ver, h);
+        break;
+      default:
+        vpx_convolve8_avg_vert_c(src, src_stride, dst, dst_stride, filter,
+                                 x0_q4, x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  }
+}
diff --git a/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c b/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c
new file mode 100644
index 0000000..2c6459a
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/vpx_convolve8_horiz_lsx.c
@@ -0,0 +1,814 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+  /* 8 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  /* 4 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+  /* 4 width cases */
+  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
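+/* Horizontal 8-tap filter of a 4x4 block; pixels are biased by 128 (xor)
+   into signed range before filtering and un-biased after. */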
+static void common_hz_8t_4x4_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter) {
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out, out0, out1;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  src -= 3;
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, out0, out1);
+  out = __lsx_vssrarni_b_h(out1, out0, 7);
+  out = __lsx_vxori_b(out, 128);
+  __lsx_vstelm_w(out, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out, dst, 0, 3);
+}
+
+static void common_hz_8t_4x8_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter) {
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out0, out1, out2, out3;
+  uint8_t *_src = (uint8_t *)src - 3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  src0 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(_src, src_stride3);
+  _src += src_stride4;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, out0, out1);
+  src0 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(_src, src_stride3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, out2, out3);
+  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+  DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+
+  __lsx_vstelm_w(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out0, dst, 0, 3);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 2);
+  dst += dst_stride;
+  __lsx_vstelm_w(out1, dst, 0, 3);
+}
+
+static void common_hz_8t_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_hz_8t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_hz_8t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_hz_8t_8x4_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter) {
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out0, out1, out2, out3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2, mask3,
+                             filter0, filter1, filter2, filter3, out0, out1,
+                             out2, out3);
+  DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+  DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+  __lsx_vstelm_d(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_d(out1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(out1, dst, 0, 1);
+}
+
+static void common_hz_8t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out0, out1, out2, out3;
+  uint8_t *_src = (uint8_t *)src - 3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  for (; loop_cnt--;) {
+    src0 = __lsx_vld(_src, 0);
+    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(_src, src_stride3);
+    _src += src_stride4;
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 1);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_8t_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                const int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_hz_8t_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else {
+    common_hz_8t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height);
+  }
+}
+
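+/* Horizontal 8-tap filter, 16 pixels wide: two rows per iteration, each row
+   loaded as two vectors at byte offsets 0 and 8. */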
+static void common_hz_8t_16w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height >> 1;
+  int32_t stride = src_stride << 1;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out0, out1, out2, out3;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  for (; loop_cnt--;) {
+    const uint8_t *_src = src + src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, _src, 0, src0, src2);
+    DUP2_ARG2(__lsx_vld, src, 8, _src, 8, src1, src3);
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vst(out0, dst, 0);
+    dst += dst_stride;
+    __lsx_vst(out1, dst, 0);
+    dst += dst_stride;
+    src += stride;
+  }
+}
+
+static void common_hz_8t_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height >> 1;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out0, out1, out2, out3;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+    src3 = __lsx_vld(src, 24);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vst(out0, dst, 0);
+    __lsx_vst(out1, dst, 16);
+
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+    src3 = __lsx_vld(src, 24);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    src += src_stride;
+
+    dst += dst_stride;
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vst(out0, dst, 0);
+    __lsx_vst(out1, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_8t_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 const int8_t *filter, int32_t height) {
+  int32_t loop_cnt = height;
+  __m128i src0, src1, src2, src3;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i out0, out1, out2, out3;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= 3;
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+    src3 = __lsx_vld(src, 24);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vst(out0, dst, 0);
+    __lsx_vst(out1, dst, 16);
+
+    DUP2_ARG2(__lsx_vld, src, 32, src, 48, src0, src2);
+    src3 = __lsx_vld(src, 56);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
+                               mask3, filter0, filter1, filter2, filter3, out0,
+                               out1, out2, out3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vst(out0, dst, 32);
+    __lsx_vst(out1, dst, 48);
+    src += src_stride;
+    dst += dst_stride;
+  }
+}
+
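+/* Horizontal 2-tap (bilinear) filter of a 4x4 block. */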
+static void common_hz_2t_4x4_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  __m128i src0, src1, src2, src3, mask;
+  __m128i filt0, vec0, vec1, vec2, vec3, res0, res1;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride + dst_stride2;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  DUP2_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, vec0, vec1);
+  DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, vec3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, vec2, vec2, FILTER_BITS, vec3, vec3,
+            FILTER_BITS, res0, res1);
+
+  __lsx_vstelm_w(res0, dst, 0, 0);
+  __lsx_vstelm_w(res0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(res1, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_w(res1, dst + dst_stride3, 0, 1);
+}
+
+static void common_hz_2t_4x8_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i res0, res1, res2, res3, filt0;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride + dst_stride2;
+
+  uint8_t *src_tmp1 = (uint8_t *)src + src_stride4;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src5,
+            src6);
+  src7 = __lsx_vldx(src_tmp1, src_stride3);
+
+  DUP4_ARG3(__lsx_vshuf_b, src1, src0, mask, src3, src2, mask, src5, src4, mask,
+            src7, src6, mask, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec4, vec5, vec6, vec7);
+  DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5,
+            FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, res0,
+            res1, res2, res3);
+
+  __lsx_vstelm_w(res0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(res0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_w(res1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_w(res1, dst, 0, 1);
+  dst += dst_stride;
+
+  __lsx_vstelm_w(res2, dst, 0, 0);
+  __lsx_vstelm_w(res2, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(res3, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_w(res3, dst + dst_stride3, 0, 1);
+}
+
+static void common_hz_2t_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_hz_2t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_hz_2t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_hz_2t_8x4_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  __m128i filt0, mask;
+  __m128i src0, src1, src2, src3;
+  __m128i vec0, vec1, vec2, vec3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+
+  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
+            src3, src3, mask, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec0, vec1, vec2, vec3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+            FILTER_BITS, vec0, vec1);
+
+  __lsx_vstelm_d(vec0, dst, 0, 0);
+  __lsx_vstelm_d(vec0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_d(vec1, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_d(vec1, dst + dst_stride3, 0, 1);
+}
+
+static void common_hz_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter, int32_t height) {
+  __m128i filt0, mask;
+  __m128i src0, src1, src2, src3, out0, out1;
+  __m128i vec0, vec1, vec2, vec3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  src += src_stride4;
+
+  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
+            src3, src3, mask, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec0, vec1, vec2, vec3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+            FILTER_BITS, out0, out1);
+
+  __lsx_vstelm_d(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_d(out1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(out1, dst, 0, 1);
+  dst += dst_stride;
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  src += src_stride4;
+
+  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
+            src3, src3, mask, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            vec0, vec1, vec2, vec3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+            FILTER_BITS, out0, out1);
+
+  __lsx_vstelm_d(out0, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(out0, dst, 0, 1);
+  dst += dst_stride;
+  __lsx_vstelm_d(out1, dst, 0, 0);
+  dst += dst_stride;
+  __lsx_vstelm_d(out1, dst, 0, 1);
+  dst += dst_stride;
+
+  if (height == 16) {
+    uint8_t *dst_tmp1 = dst + dst_stride4;
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+              mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, vec0, vec1, vec2, vec3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+              FILTER_BITS, out0, out1);
+
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+              mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, vec0, vec1, vec2, vec3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, vec1, vec0, FILTER_BITS, vec3, vec2,
+              FILTER_BITS, out0, out1);
+
+    __lsx_vstelm_d(out0, dst_tmp1, 0, 0);
+    __lsx_vstelm_d(out0, dst_tmp1 + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst_tmp1 + dst_stride2, 0, 0);
+    __lsx_vstelm_d(out1, dst_tmp1 + dst_stride3, 0, 1);
+  }
+}
+
+static void common_hz_2t_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_hz_2t_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else {
+    common_hz_2t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height);
+  }
+}
+
+static void common_hz_2t_16w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 2) - 1;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  uint8_t *src_tmp1 = (uint8_t *)src + 8;
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+  src6 = __lsx_vldx(src, src_stride3);
+  src1 = __lsx_vld(src_tmp1, 0);
+  DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+            src5);
+  src7 = __lsx_vldx(src_tmp1, src_stride3);
+  src += src_stride4;
+
+  DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2, mask,
+            src3, src3, mask, vec0, vec1, vec2, vec3);
+  DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6, mask,
+            src7, src7, mask, vec4, vec5, vec6, vec7);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            out0, out1, out2, out3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7, filt0,
+            out4, out5, out6, out7);
+  DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+            FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS, out0,
+            out1, out2, out3);
+
+  __lsx_vst(out0, dst, 0);
+  dst += dst_stride;
+  __lsx_vst(out1, dst, 0);
+  dst += dst_stride;
+  __lsx_vst(out2, dst, 0);
+  dst += dst_stride;
+  __lsx_vst(out3, dst, 0);
+  dst += dst_stride;
+
+  for (; loop_cnt--;) {
+    src_tmp1 += src_stride4;
+
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src4);
+    src6 = __lsx_vldx(src, src_stride3);
+
+    src1 = __lsx_vld(src_tmp1, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp1, src_stride, src_tmp1, src_stride2, src3,
+              src5);
+    src7 = __lsx_vldx(src_tmp1, src_stride3);
+    src += src_stride4;
+
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+              mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+    DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6,
+              mask, src7, src7, mask, vec4, vec5, vec6, vec7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, out0, out1, out2, out3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, out4, out5, out6, out7);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+              out0, out1, out2, out3);
+
+    __lsx_vst(out0, dst, 0);
+    dst += dst_stride;
+    __lsx_vst(out1, dst, 0);
+    dst += dst_stride;
+    __lsx_vst(out2, dst, 0);
+    dst += dst_stride;
+    __lsx_vst(out3, dst, 0);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 1);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src2);
+    src3 = __lsx_vld(src, 24);
+    src1 = __lsx_vshuf_b(src2, src0, shuff);
+    src += src_stride;
+    DUP2_ARG2(__lsx_vld, src, 0, src, 16, src4, src6);
+    src7 = __lsx_vld(src, 24);
+    src5 = __lsx_vshuf_b(src6, src4, shuff);
+    src += src_stride;
+
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+              mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+    DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6,
+              mask, src7, src7, mask, vec4, vec5, vec6, vec7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, out0, out1, out2, out3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, out4, out5, out6, out7);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+              out0, out1, out2, out3);
+
+    __lsx_vst(out0, dst, 0);
+    __lsx_vst(out1, dst, 16);
+    dst += dst_stride;
+
+    __lsx_vst(out2, dst, 0);
+    __lsx_vst(out3, dst, 16);
+    dst += dst_stride;
+  }
+}
+
+static void common_hz_2t_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt0, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
+  __m128i out0, out1, out2, out3, out4, out5, out6, out7;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  for (; loop_cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src2, src4,
+              src6);
+    src7 = __lsx_vld(src, 56);
+    DUP2_ARG3(__lsx_vshuf_b, src2, src0, shuff, src4, src2, shuff, src1, src3);
+    src5 = __lsx_vshuf_b(src6, src4, shuff);
+    src += src_stride;
+
+    DUP4_ARG3(__lsx_vshuf_b, src0, src0, mask, src1, src1, mask, src2, src2,
+              mask, src3, src3, mask, vec0, vec1, vec2, vec3);
+    DUP4_ARG3(__lsx_vshuf_b, src4, src4, mask, src5, src5, mask, src6, src6,
+              mask, src7, src7, mask, vec4, vec5, vec6, vec7);
+
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, out0, out1, out2, out3);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, out4, out5, out6, out7);
+    DUP4_ARG3(__lsx_vssrarni_bu_h, out1, out0, FILTER_BITS, out3, out2,
+              FILTER_BITS, out5, out4, FILTER_BITS, out7, out6, FILTER_BITS,
+              out0, out1, out2, out3);
+
+    __lsx_vst(out0, dst, 0);
+    __lsx_vst(out1, dst, 16);
+    __lsx_vst(out2, dst, 32);
+    __lsx_vst(out3, dst, 48);
+    dst += dst_stride;
+  }
+}
+
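+/* Entry point for horizontal convolve: copy the selected kernel, then pick a
+   2-tap or 8-tap LSX path by block width, falling back to the C
+   implementation for unsupported widths. */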
+void vpx_convolve8_horiz_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                             uint8_t *dst, ptrdiff_t dst_stride,
+                             const InterpKernel *filter, int x0_q4,
+                             int x_step_q4, int y0_q4, int y_step_q4, int w,
+                             int h) {
+  const int16_t *const filter_x = filter[x0_q4];
+  int8_t cnt, filt_hor[8];
+
+  assert(x_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_hor[cnt] = filter_x[cnt];
+  }
+  if (vpx_get_filter_taps(filter_x) == 2) {
+    switch (w) {
+      case 4:
+        common_hz_2t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            &filt_hor[3], h);
+        break;
+      case 8:
+        common_hz_2t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            &filt_hor[3], h);
+        break;
+      case 16:
+        common_hz_2t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             &filt_hor[3], h);
+        break;
+      case 32:
+        common_hz_2t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             &filt_hor[3], h);
+        break;
+      case 64:
+        common_hz_2t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             &filt_hor[3], h);
+        break;
+      default:
+        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                              x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  } else {
+    switch (w) {
+      case 4:
+        common_hz_8t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            filt_hor, h);
+        break;
+      case 8:
+        common_hz_8t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            filt_hor, h);
+        break;
+      case 16:
+        common_hz_8t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             filt_hor, h);
+        break;
+      case 32:
+        common_hz_8t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             filt_hor, h);
+        break;
+      case 64:
+        common_hz_8t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             filt_hor, h);
+        break;
+      default:
+        vpx_convolve8_horiz_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                              x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  }
+}
diff --git a/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c b/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c
new file mode 100644
index 0000000..9f5cd6c
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/vpx_convolve8_lsx.c
@@ -0,0 +1,697 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static const uint8_t mc_filt_mask_arr[16 * 3] = {
+  /* 8 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
+  /* 4 width cases */
+  0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
+  /* 4 width cases */
+  8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
+};
+
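+/* Combined 8-tap horizontal and 8-tap vertical filtering, 4 pixels wide:
+   horizontal results are packed and carried across loop iterations to feed
+   the vertical taps. */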
+static void common_hv_8ht_8vt_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  __m128i out0, out1;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 16);
+  src -= (3 + 3 * src_stride);
+  DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
+            filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  src += src_stride;
+  src4 = __lsx_vld(src, 0);
+  src += src_stride;
+  src5 = __lsx_vld(src, 0);
+  src += src_stride;
+  src6 = __lsx_vld(src, 0);
+  src += src_stride;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
+
+  tmp0 = horiz_8tap_filt(src0, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp2 = horiz_8tap_filt(src2, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp4 = horiz_8tap_filt(src4, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  tmp5 = horiz_8tap_filt(src5, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  DUP2_ARG3(__lsx_vshuf_b, tmp2, tmp0, shuff, tmp4, tmp2, shuff, tmp1, tmp3);
+  DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
+            filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+  DUP2_ARG2(__lsx_vpackev_b, tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
+  tmp2 = __lsx_vpackev_b(tmp5, tmp4);
+
+  for (; loop_cnt--;) {
+    LSX_LD_4(src, src_stride, src7, src8, src9, src10);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    tmp3 = horiz_8tap_filt(src7, src8, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    tmp4 = __lsx_vshuf_b(tmp3, tmp5, shuff);
+    tmp4 = __lsx_vpackev_b(tmp3, tmp4);
+    out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp4, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src1 = horiz_8tap_filt(src9, src10, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    src0 = __lsx_vshuf_b(src1, tmp3, shuff);
+    src0 = __lsx_vpackev_b(src1, src0);
+    out1 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp4, src0, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    out0 = __lsx_vssrarni_b_h(out1, out0, 7);
+    out0 = __lsx_vxori_b(out0, 128);
+    __lsx_vstelm_w(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 3);
+    dst += dst_stride;
+
+    tmp5 = src1;
+    tmp0 = tmp2;
+    tmp1 = tmp4;
+    tmp2 = src0;
+  }
+}
+
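+/* Combined 8-tap horizontal and vertical filtering, 8 pixels wide: seven
+   rows of horizontal output stay live to cover the vertical 8-tap window. */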
+static void common_hv_8ht_8vt_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filt_hz0, filt_hz1, filt_hz2, filt_hz3;
+  __m128i filt_vt0, filt_vt1, filt_vt2, filt_vt3;
+  __m128i mask0, mask1, mask2, mask3;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
+  __m128i out0, out1;
+
+  mask0 = __lsx_vld(mc_filt_mask_arr, 0);
+  src -= (3 + 3 * src_stride);
+  DUP4_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_horiz, 2, filter_horiz, 4,
+            filter_horiz, 6, filt_hz0, filt_hz1, filt_hz2, filt_hz3);
+  DUP2_ARG2(__lsx_vaddi_bu, mask0, 2, mask0, 4, mask1, mask2);
+  mask3 = __lsx_vaddi_bu(mask0, 6);
+
+  LSX_LD_4(src, src_stride, src0, src1, src2, src3);
+  src += src_stride;
+  src4 = __lsx_vld(src, 0);
+  src += src_stride;
+  src5 = __lsx_vld(src, 0);
+  src += src_stride;
+  src6 = __lsx_vld(src, 0);
+  src += src_stride;
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
+
+  src0 = horiz_8tap_filt(src0, src0, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src1 = horiz_8tap_filt(src1, src1, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src2 = horiz_8tap_filt(src2, src2, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src3 = horiz_8tap_filt(src3, src3, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src4 = horiz_8tap_filt(src4, src4, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src5 = horiz_8tap_filt(src5, src5, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+  src6 = horiz_8tap_filt(src6, src6, mask0, mask1, mask2, mask3, filt_hz0,
+                         filt_hz1, filt_hz2, filt_hz3);
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter_vert, 0, filter_vert, 2, filter_vert, 4,
+            filter_vert, 6, filt_vt0, filt_vt1, filt_vt2, filt_vt3);
+  DUP4_ARG2(__lsx_vpackev_b, src1, src0, src3, src2, src5, src4, src2, src1,
+            tmp0, tmp1, tmp2, tmp4);
+  DUP2_ARG2(__lsx_vpackev_b, src4, src3, src6, src5, tmp5, tmp6);
+
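+  /* Same horizontal-then-vertical pipeline as the 4-wide version, on full
+   * 8-byte rows; the tmp0..tmp6 history registers are rotated at the bottom
+   * of the loop so only four new rows are filtered per iteration. */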
+  for (; loop_cnt--;) {
+    LSX_LD_4(src, src_stride, src7, src8, src9, src10);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    src7 = horiz_8tap_filt(src7, src7, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    tmp3 = __lsx_vpackev_b(src7, src6);
+    out0 = filt_8tap_dpadd_s_h(tmp0, tmp1, tmp2, tmp3, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src8 = horiz_8tap_filt(src8, src8, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    src0 = __lsx_vpackev_b(src8, src7);
+    out1 = filt_8tap_dpadd_s_h(tmp4, tmp5, tmp6, src0, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src9 = horiz_8tap_filt(src9, src9, mask0, mask1, mask2, mask3, filt_hz0,
+                           filt_hz1, filt_hz2, filt_hz3);
+    src1 = __lsx_vpackev_b(src9, src8);
+    src3 = filt_8tap_dpadd_s_h(tmp1, tmp2, tmp3, src1, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    src10 = horiz_8tap_filt(src10, src10, mask0, mask1, mask2, mask3, filt_hz0,
+                            filt_hz1, filt_hz2, filt_hz3);
+    src2 = __lsx_vpackev_b(src10, src9);
+    src4 = filt_8tap_dpadd_s_h(tmp5, tmp6, src0, src2, filt_vt0, filt_vt1,
+                               filt_vt2, filt_vt3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, src4, src3, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 1);
+    dst += dst_stride;
+
+    src6 = src10;
+    tmp0 = tmp2;
+    tmp1 = tmp3;
+    tmp2 = src1;
+    tmp4 = tmp6;
+    tmp5 = src0;
+    tmp6 = src2;
+  }
+}
+
+static void common_hv_8ht_8vt_16w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
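+  /* A 16-wide block is filtered as two adjacent 8-wide columns. */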
+  common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                           filter_vert, height);
+  src += 8;
+  dst += 8;
+
+  common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                           filter_vert, height);
+}
+
+static void common_hv_8ht_8vt_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 4; multiple8_cnt--;) {
+    common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                             filter_vert, height);
+    src += 8;
+    dst += 8;
+  }
+}
+
+static void common_hv_8ht_8vt_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 8; multiple8_cnt--;) {
+    common_hv_8ht_8vt_8w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                             filter_vert, height);
+    src += 8;
+    dst += 8;
+  }
+}
+
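+/* 2-tap (bilinear) horizontal + vertical paths. The two taps are loaded as a
+ * single 16-bit pair (__lsx_vldrepl_h) and applied with unsigned dot
+ * products, so the sign-bias toggling of the 8-tap paths is not needed. */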
+static void common_hv_2ht_2vt_4x4_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz,
+                                      int8_t *filter_vert) {
+  __m128i src0, src1, src2, src3, src4, mask;
+  __m128i filt_vt, filt_hz, vec0, vec1;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, tmp0, tmp1;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+  /* rearranging filter */
+  filt_hz = __lsx_vldrepl_h(filter_horiz, 0);
+  filt_vt = __lsx_vldrepl_h(filter_vert, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
+  hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+  hz_out4 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+
+  hz_out1 = __lsx_vshuf_b(hz_out2, hz_out0, shuff);
+  hz_out3 = __lsx_vpickod_d(hz_out4, hz_out2);
+
+  DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+  DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp0, tmp1);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp0, tmp0, FILTER_BITS, tmp1, tmp1,
+            FILTER_BITS, tmp0, tmp1);
+
+  __lsx_vstelm_w(tmp0, dst, 0, 0);
+  __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(tmp1, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_w(tmp1, dst + dst_stride3, 0, 1);
+}
+
+static void common_hv_2ht_2vt_4x8_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz,
+                                      int8_t *filter_vert) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+  __m128i hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
+  __m128i hz_out7, hz_out8, vec4, vec5, vec6, vec7;
+  __m128i shuff = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 16);
+
+  /* rearranging filter */
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  src += src_stride4;
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src5, src6, src7, src8);
+  src += src_stride4;
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src1, mask, filt_hz);
+  hz_out2 = horiz_2tap_filt_uh(src2, src3, mask, filt_hz);
+  hz_out4 = horiz_2tap_filt_uh(src4, src5, mask, filt_hz);
+  hz_out6 = horiz_2tap_filt_uh(src6, src7, mask, filt_hz);
+  hz_out8 = horiz_2tap_filt_uh(src8, src8, mask, filt_hz);
+
+  DUP2_ARG3(__lsx_vshuf_b, hz_out2, hz_out0, shuff, hz_out4, hz_out2, shuff,
+            hz_out1, hz_out3);
+  hz_out5 = __lsx_vshuf_b(hz_out6, hz_out4, shuff);
+  hz_out7 = __lsx_vpickod_d(hz_out8, hz_out6);
+  DUP4_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, hz_out5,
+            hz_out4, hz_out7, hz_out6, vec0, vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, vec2, filt_vt, vec3,
+            filt_vt, vec4, vec5, vec6, vec7);
+  DUP4_ARG3(__lsx_vssrarni_bu_h, vec4, vec4, FILTER_BITS, vec5, vec5,
+            FILTER_BITS, vec6, vec6, FILTER_BITS, vec7, vec7, FILTER_BITS, vec4,
+            vec5, vec6, vec7);
+
+  __lsx_vstelm_w(vec4, dst, 0, 0);
+  __lsx_vstelm_w(vec4, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(vec5, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_w(vec5, dst + dst_stride3, 0, 1);
+  dst += dst_stride4;
+  __lsx_vstelm_w(vec6, dst, 0, 0);
+  __lsx_vstelm_w(vec6, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(vec7, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_w(vec7, dst + dst_stride3, 0, 1);
+}
+
+static void common_hv_2ht_2vt_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  if (height == 4) {
+    common_hv_2ht_2vt_4x4_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                              filter_vert);
+  } else if (height == 8) {
+    common_hv_2ht_2vt_4x8_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                              filter_vert);
+  }
+}
+
+static void common_hv_2ht_2vt_8x4_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz,
+                                      int8_t *filter_vert) {
+  __m128i src0, src1, src2, src3, src4, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1, vec2, vec3;
+  __m128i hz_out0, hz_out1, tmp0, tmp1, tmp2, tmp3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+  hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+  vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+  tmp0 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+  hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+  vec1 = __lsx_vpackev_b(hz_out0, hz_out1);
+  tmp1 = __lsx_vdp2_h_bu(vec1, filt_vt);
+
+  hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+  vec2 = __lsx_vpackev_b(hz_out1, hz_out0);
+  tmp2 = __lsx_vdp2_h_bu(vec2, filt_vt);
+
+  hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+  vec3 = __lsx_vpackev_b(hz_out0, hz_out1);
+  tmp3 = __lsx_vdp2_h_bu(vec3, filt_vt);
+
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+            FILTER_BITS, tmp0, tmp1);
+
+  __lsx_vstelm_d(tmp0, dst, 0, 0);
+  __lsx_vstelm_d(tmp0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_d(tmp1, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_d(tmp1, dst + dst_stride3, 0, 1);
+}
+
+static void common_hv_2ht_2vt_8x8mult_lsx(const uint8_t *src,
+                                          int32_t src_stride, uint8_t *dst,
+                                          int32_t dst_stride,
+                                          int8_t *filter_horiz,
+                                          int8_t *filter_vert, int32_t height) {
+  uint32_t loop_cnt = (height >> 3);
+  __m128i src0, src1, src2, src3, src4, mask;
+  __m128i filt_hz, filt_vt, vec0;
+  __m128i hz_out0, hz_out1, tmp1, tmp2, tmp3, tmp4;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+
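+  /* Eight output rows per iteration, produced as two groups of four; the
+   * last horizontal result (hz_out0) carries over as the seed row for the
+   * next group. */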
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3,
+              FILTER_BITS, tmp1, tmp2);
+
+    __lsx_vstelm_d(tmp1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp1, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 1);
+    dst += dst_stride;
+
+    hz_out1 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp1 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp2 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out1 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out1, hz_out0);
+    tmp3 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    hz_out0 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+    vec0 = __lsx_vpackev_b(hz_out0, hz_out1);
+    tmp4 = __lsx_vdp2_h_bu(vec0, filt_vt);
+
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp2, tmp1, FILTER_BITS, tmp4, tmp3,
+              FILTER_BITS, tmp1, tmp2);
+
+    __lsx_vstelm_d(tmp1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp1, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(tmp2, dst, 0, 1);
+    dst += dst_stride;
+  }
+}
+
+static void common_hv_2ht_2vt_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter_horiz, int8_t *filter_vert,
+                                     int32_t height) {
+  if (height == 4) {
+    common_hv_2ht_2vt_8x4_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                              filter_vert);
+  } else {
+    common_hv_2ht_2vt_8x8mult_lsx(src, src_stride, dst, dst_stride,
+                                  filter_horiz, filter_vert, height);
+  }
+}
+
+static void common_hv_2ht_2vt_16w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, mask;
+  __m128i filt_hz, filt_vt, vec0, vec1;
+  __m128i tmp, tmp1, tmp2, hz_out0, hz_out1, hz_out2, hz_out3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  mask = __lsx_vld(mc_filt_mask_arr, 0);
+
+  /* rearranging filter */
+  DUP2_ARG2(__lsx_vldrepl_h, filter_horiz, 0, filter_vert, 0, filt_hz, filt_vt);
+
+  DUP2_ARG2(__lsx_vld, src, 0, src, 8, src0, src1);
+  src += src_stride;
+
+  hz_out0 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+  hz_out2 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+
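+  /* Four 16-wide rows per iteration; the two 8-pixel halves of each row are
+   * read through the src and src_tmp0 pointers. */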
+  for (; loop_cnt--;) {
+    const uint8_t *src_tmp0 = src + 8;
+
+    DUP2_ARG2(__lsx_vld, src, 0, src_tmp0, 0, src0, src1);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp0, src_stride, src,
+              src_stride2, src_tmp0, src_stride2, src2, src3, src4, src5);
+    DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp0, src_stride3, src6, src7);
+    src += src_stride4;
+
+    hz_out1 = horiz_2tap_filt_uh(src0, src0, mask, filt_hz);
+    hz_out3 = horiz_2tap_filt_uh(src1, src1, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    hz_out0 = horiz_2tap_filt_uh(src2, src2, mask, filt_hz);
+    hz_out2 = horiz_2tap_filt_uh(src3, src3, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    hz_out1 = horiz_2tap_filt_uh(src4, src4, mask, filt_hz);
+    hz_out3 = horiz_2tap_filt_uh(src5, src5, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out1, hz_out0, hz_out3, hz_out2, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    hz_out0 = horiz_2tap_filt_uh(src6, src6, mask, filt_hz);
+    hz_out2 = horiz_2tap_filt_uh(src7, src7, mask, filt_hz);
+    DUP2_ARG2(__lsx_vpackev_b, hz_out0, hz_out1, hz_out2, hz_out3, vec0, vec1);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt_vt, vec1, filt_vt, tmp1, tmp2);
+    tmp = __lsx_vssrarni_bu_h(tmp2, tmp1, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+  }
+}
+
+static void common_hv_2ht_2vt_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                            filter_vert, height);
+  src += 16;
+  dst += 16;
+
+  common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                            filter_vert, height);
+}
+
+static void common_hv_2ht_2vt_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter_horiz, int8_t *filter_vert,
+                                      int32_t height) {
+  int32_t multiple8_cnt;
+  for (multiple8_cnt = 4; multiple8_cnt--;) {
+    common_hv_2ht_2vt_16w_lsx(src, src_stride, dst, dst_stride, filter_horiz,
+                              filter_vert, height);
+    src += 16;
+    dst += 16;
+  }
+}
+
+void vpx_convolve8_lsx(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
+                       ptrdiff_t dst_stride, const InterpKernel *filter,
+                       int x0_q4, int32_t x_step_q4, int y0_q4,
+                       int32_t y_step_q4, int32_t w, int32_t h) {
+  const int16_t *const filter_x = filter[x0_q4];
+  const int16_t *const filter_y = filter[y0_q4];
+  int8_t cnt, filt_hor[8], filt_ver[8];
+
+  assert(x_step_q4 == 16);
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_x)[1] != 0x800000);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  for (cnt = 0; cnt < 8; ++cnt) {
+    filt_hor[cnt] = filter_x[cnt];
+    filt_ver[cnt] = filter_y[cnt];
+  }
+
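+  /* Dispatch on tap count: 2-tap in both directions takes the bilinear path,
+   * a 2-tap/8-tap mix falls back to the C reference, and everything else
+   * uses the 8-tap path. A bilinear kernel only has its two center taps set,
+   * hence &filt_hor[3] / &filt_ver[3]. */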
+  if (vpx_get_filter_taps(filter_x) == 2 &&
+      vpx_get_filter_taps(filter_y) == 2) {
+    switch (w) {
+      case 4:
+        common_hv_2ht_2vt_4w_lsx(src, (int32_t)src_stride, dst,
+                                 (int32_t)dst_stride, &filt_hor[3],
+                                 &filt_ver[3], (int32_t)h);
+        break;
+      case 8:
+        common_hv_2ht_2vt_8w_lsx(src, (int32_t)src_stride, dst,
+                                 (int32_t)dst_stride, &filt_hor[3],
+                                 &filt_ver[3], (int32_t)h);
+        break;
+      case 16:
+        common_hv_2ht_2vt_16w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, &filt_hor[3],
+                                  &filt_ver[3], (int32_t)h);
+        break;
+      case 32:
+        common_hv_2ht_2vt_32w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, &filt_hor[3],
+                                  &filt_ver[3], (int32_t)h);
+        break;
+      case 64:
+        common_hv_2ht_2vt_64w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, &filt_hor[3],
+                                  &filt_ver[3], (int32_t)h);
+        break;
+      default:
+        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                        x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  } else if (vpx_get_filter_taps(filter_x) == 2 ||
+             vpx_get_filter_taps(filter_y) == 2) {
+    vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4, x_step_q4,
+                    y0_q4, y_step_q4, w, h);
+  } else {
+    switch (w) {
+      case 4:
+        common_hv_8ht_8vt_4w_lsx(src, (int32_t)src_stride, dst,
+                                 (int32_t)dst_stride, filt_hor, filt_ver,
+                                 (int32_t)h);
+        break;
+      case 8:
+        common_hv_8ht_8vt_8w_lsx(src, (int32_t)src_stride, dst,
+                                 (int32_t)dst_stride, filt_hor, filt_ver,
+                                 (int32_t)h);
+        break;
+      case 16:
+        common_hv_8ht_8vt_16w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, filt_hor, filt_ver,
+                                  (int32_t)h);
+        break;
+      case 32:
+        common_hv_8ht_8vt_32w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, filt_hor, filt_ver,
+                                  (int32_t)h);
+        break;
+      case 64:
+        common_hv_8ht_8vt_64w_lsx(src, (int32_t)src_stride, dst,
+                                  (int32_t)dst_stride, filt_hor, filt_ver,
+                                  (int32_t)h);
+        break;
+      default:
+        vpx_convolve8_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                        x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  }
+}
diff --git a/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c b/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c
new file mode 100644
index 0000000..6022e43
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/vpx_convolve8_vert_lsx.c
@@ -0,0 +1,825 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <assert.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_dsp/loongarch/vpx_convolve_lsx.h"
+
+static void common_vt_8t_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
+  __m128i reg0, reg1, reg2, reg3, reg4;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i out0, out1;
+  uint8_t *_src = (uint8_t *)src - src_stride3;
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+  src0 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(_src, src_stride3);
+  _src += src_stride4;
+  src4 = __lsx_vld(_src, 0);
+  DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src5, src6);
+  _src += src_stride3;
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, tmp0,
+            tmp1, tmp2, tmp3);
+  DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, tmp4, tmp5);
+  DUP2_ARG2(__lsx_vilvl_d, tmp3, tmp0, tmp4, tmp1, reg0, reg1);
+  reg2 = __lsx_vilvl_d(tmp5, tmp2);
+  DUP2_ARG2(__lsx_vxori_b, reg0, 128, reg1, 128, reg0, reg1);
+  reg2 = __lsx_vxori_b(reg2, 128);
+
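+  /* Four output rows per iteration: interleaved row pairs are packed into
+   * 64-bit lanes so one dot-product vector covers two 4-pixel rows. */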
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(_src, 0);
+    DUP2_ARG2(__lsx_vldx, _src, src_stride, _src, src_stride2, src8, src9);
+    src10 = __lsx_vldx(_src, src_stride3);
+    _src += src_stride4;
+    DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+              tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG2(__lsx_vilvl_d, tmp1, tmp0, tmp3, tmp2, reg3, reg4);
+    DUP2_ARG2(__lsx_vxori_b, reg3, 128, reg4, 128, reg3, reg4);
+    out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, reg3, filter0, filter1,
+                               filter2, filter3);
+    out1 = filt_8tap_dpadd_s_h(reg1, reg2, reg3, reg4, filter0, filter1,
+                               filter2, filter3);
+    out0 = __lsx_vssrarni_b_h(out1, out0, 7);
+    out0 = __lsx_vxori_b(out0, 128);
+    __lsx_vstelm_w(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 2);
+    dst += dst_stride;
+    __lsx_vstelm_w(out0, dst, 0, 3);
+    dst += dst_stride;
+
+    reg0 = reg2;
+    reg1 = reg3;
+    reg2 = reg4;
+    src6 = src10;
+  }
+}
+
+static void common_vt_8t_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i out0, out1, out2, out3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  src -= src_stride3;
+
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  src0 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+  src3 = __lsx_vldx(src, src_stride3);
+  src += src_stride4;
+  src4 = __lsx_vld(src, 0);
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+  src += src_stride3;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0,
+            reg1, reg2, reg3);
+  DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src8, src9);
+    src10 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+              tmp0, tmp1, tmp2, tmp3);
+    out0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, tmp0, filter0, filter1,
+                               filter2, filter3);
+    out1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, tmp1, filter0, filter1,
+                               filter2, filter3);
+    out2 = filt_8tap_dpadd_s_h(reg1, reg2, tmp0, tmp2, filter0, filter1,
+                               filter2, filter3);
+    out3 = filt_8tap_dpadd_s_h(reg4, reg5, tmp1, tmp3, filter0, filter1,
+                               filter2, filter3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, out1, out0, 7, out3, out2, 7, out0, out1);
+    DUP2_ARG2(__lsx_vxori_b, out0, 128, out1, 128, out0, out1);
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out0, dst, 0, 1);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(out1, dst, 0, 1);
+    dst += dst_stride;
+
+    reg0 = reg2;
+    reg1 = tmp0;
+    reg2 = tmp2;
+    reg3 = reg5;
+    reg4 = tmp1;
+    reg5 = tmp3;
+    src6 = src10;
+  }
+}
+
+static void common_vt_8t_16w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = height >> 2;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+  __m128i reg6, reg7, reg8, reg9, reg10, reg11;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  src -= src_stride3;
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  src += src_stride4;
+  DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+  src += src_stride3;
+
+  DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+            src1, src2, src3);
+  DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+  src6 = __lsx_vxori_b(src6, 128);
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1, reg0,
+            reg1, reg2, reg3);
+  DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+  DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1, reg6,
+            reg7, reg8, reg9);
+  DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);
+
+  for (; loop_cnt--;) {
+    src7 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src8, src9);
+    src10 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128, src7,
+              src8, src9, src10);
+    DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+              src0, src1, src2, src3);
+    DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
+              src4, src5, src7, src8);
+    tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1,
+                               filter2, filter3);
+    tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1,
+                               filter2, filter3);
+    tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1,
+                               filter2, filter3);
+    tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1,
+                               filter2, filter3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    __lsx_vst(tmp0, dst, 0);
+    dst += dst_stride;
+    __lsx_vst(tmp1, dst, 0);
+    dst += dst_stride;
+    tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1,
+                               filter2, filter3);
+    tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1,
+                               filter2, filter3);
+    tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1,
+                               filter2, filter3);
+    tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1,
+                               filter2, filter3);
+    DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+    DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+    __lsx_vst(tmp0, dst, 0);
+    dst += dst_stride;
+    __lsx_vst(tmp1, dst, 0);
+    dst += dst_stride;
+
+    reg0 = reg2;
+    reg1 = src0;
+    reg2 = src2;
+    reg3 = reg5;
+    reg4 = src1;
+    reg5 = src3;
+    reg6 = reg8;
+    reg7 = src4;
+    reg8 = src7;
+    reg9 = reg11;
+    reg10 = src5;
+    reg11 = src8;
+    src6 = src10;
+  }
+}
+
+static void common_vt_8t_16w_mult_lsx(const uint8_t *src, int32_t src_stride,
+                                      uint8_t *dst, int32_t dst_stride,
+                                      int8_t *filter, int32_t height,
+                                      int32_t width) {
+  const uint8_t *src_tmp;
+  uint8_t *dst_tmp;
+  uint32_t cnt = width >> 4;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i filter0, filter1, filter2, filter3;
+  __m128i reg0, reg1, reg2, reg3, reg4, reg5;
+  __m128i reg6, reg7, reg8, reg9, reg10, reg11;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride + src_stride2;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+  src -= src_stride3;
+  DUP4_ARG2(__lsx_vldrepl_h, filter, 0, filter, 2, filter, 4, filter, 6,
+            filter0, filter1, filter2, filter3);
+
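+  /* The outer loop steps across the block in 16-pixel columns; the inner
+   * loop emits four rows of one column per pass, with vilvl_b/vilvh_b
+   * covering the low and high 8-byte halves. */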
+  for (; cnt--;) {
+    uint32_t loop_cnt = height >> 2;
+
+    src_tmp = src;
+    dst_tmp = dst;
+
+    src0 = __lsx_vld(src_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src1,
+              src2);
+    src3 = __lsx_vldx(src_tmp, src_stride3);
+    src_tmp += src_stride4;
+    src4 = __lsx_vld(src_tmp, 0);
+    DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+              src6);
+    src_tmp += src_stride3;
+
+    DUP4_ARG2(__lsx_vxori_b, src0, 128, src1, 128, src2, 128, src3, 128, src0,
+              src1, src2, src3);
+    DUP2_ARG2(__lsx_vxori_b, src4, 128, src5, 128, src4, src5);
+    src6 = __lsx_vxori_b(src6, 128);
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src3, src2, src5, src4, src2, src1,
+              reg0, reg1, reg2, reg3);
+    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src6, src5, reg4, reg5);
+    DUP4_ARG2(__lsx_vilvh_b, src1, src0, src3, src2, src5, src4, src2, src1,
+              reg6, reg7, reg8, reg9);
+    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src6, src5, reg10, reg11);
+
+    for (; loop_cnt--;) {
+      src7 = __lsx_vld(src_tmp, 0);
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src8,
+                src9);
+      src10 = __lsx_vldx(src_tmp, src_stride3);
+      src_tmp += src_stride4;
+      DUP4_ARG2(__lsx_vxori_b, src7, 128, src8, 128, src9, 128, src10, 128,
+                src7, src8, src9, src10);
+      DUP4_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, src9, src8, src10, src9,
+                src0, src1, src2, src3);
+      DUP4_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, src9, src8, src10, src9,
+                src4, src5, src7, src8);
+      tmp0 = filt_8tap_dpadd_s_h(reg0, reg1, reg2, src0, filter0, filter1,
+                                 filter2, filter3);
+      tmp1 = filt_8tap_dpadd_s_h(reg3, reg4, reg5, src1, filter0, filter1,
+                                 filter2, filter3);
+      tmp2 = filt_8tap_dpadd_s_h(reg6, reg7, reg8, src4, filter0, filter1,
+                                 filter2, filter3);
+      tmp3 = filt_8tap_dpadd_s_h(reg9, reg10, reg11, src5, filter0, filter1,
+                                 filter2, filter3);
+      DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+      DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+      __lsx_vst(tmp0, dst_tmp, 0);
+      __lsx_vstx(tmp1, dst_tmp, dst_stride);
+      tmp0 = filt_8tap_dpadd_s_h(reg1, reg2, src0, src2, filter0, filter1,
+                                 filter2, filter3);
+      tmp1 = filt_8tap_dpadd_s_h(reg4, reg5, src1, src3, filter0, filter1,
+                                 filter2, filter3);
+      tmp2 = filt_8tap_dpadd_s_h(reg7, reg8, src4, src7, filter0, filter1,
+                                 filter2, filter3);
+      tmp3 = filt_8tap_dpadd_s_h(reg10, reg11, src5, src8, filter0, filter1,
+                                 filter2, filter3);
+      DUP2_ARG3(__lsx_vssrarni_b_h, tmp2, tmp0, 7, tmp3, tmp1, 7, tmp0, tmp1);
+      DUP2_ARG2(__lsx_vxori_b, tmp0, 128, tmp1, 128, tmp0, tmp1);
+      __lsx_vstx(tmp0, dst_tmp, dst_stride2);
+      __lsx_vstx(tmp1, dst_tmp, dst_stride3);
+      dst_tmp += dst_stride4;
+
+      reg0 = reg2;
+      reg1 = src0;
+      reg2 = src2;
+      reg3 = reg5;
+      reg4 = src1;
+      reg5 = src3;
+      reg6 = reg8;
+      reg7 = src4;
+      reg8 = src7;
+      reg9 = reg11;
+      reg10 = src5;
+      reg11 = src8;
+      src6 = src10;
+    }
+    src += 16;
+    dst += 16;
+  }
+}
+
+static void common_vt_8t_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height,
+                            32);
+}
+
+static void common_vt_8t_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  common_vt_8t_16w_mult_lsx(src, src_stride, dst, dst_stride, filter, height,
+                            64);
+}
+
+static void common_vt_2t_4x4_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  __m128i src0, src1, src2, src3, src4;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+  __m128i filt0, tmp0, tmp1;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  src += (src_stride4 + src_stride);
+
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0,
+            vec1, vec2, vec3);
+  DUP2_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec4, vec5);
+  DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+  tmp0 = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+
+  __lsx_vstelm_w(tmp0, dst, 0, 0);
+  __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2);
+  __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3);
+}
+
+static void common_vt_2t_4x8_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5;
+  __m128i vec6, vec7, vec8, vec9, vec10, vec11;
+  __m128i tmp0, tmp1, tmp2, tmp3;
+  __m128i filt0;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+  uint8_t *dst_tmp1 = dst + dst_stride4;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+  src += src_stride4;
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src5, src6, src7, src8);
+  src += (src_stride4 + src_stride);
+
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0,
+            vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7, vec4,
+            vec5, vec6, vec7);
+  DUP4_ARG2(__lsx_vilvl_d, vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec8,
+            vec9, vec10, vec11);
+
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec8, filt0, vec9, filt0, vec10, filt0, vec11,
+            filt0, tmp0, tmp1, tmp2, tmp3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+            FILTER_BITS, tmp0, tmp1);
+
+  __lsx_vstelm_w(tmp0, dst, 0, 0);
+  __lsx_vstelm_w(tmp0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_w(tmp0, dst + dst_stride2, 0, 2);
+  __lsx_vstelm_w(tmp0, dst + dst_stride3, 0, 3);
+
+  __lsx_vstelm_w(tmp1, dst_tmp1, 0, 0);
+  __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride, 0, 1);
+  __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride2, 0, 2);
+  __lsx_vstelm_w(tmp1, dst_tmp1 + dst_stride3, 0, 3);
+}
+
+static void common_vt_2t_4w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_vt_2t_4x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else if (height == 8) {
+    common_vt_2t_4x8_lsx(src, src_stride, dst, dst_stride, filter);
+  }
+}
+
+static void common_vt_2t_8x4_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter) {
+  __m128i src0, src1, src2, src3, src4, vec0, vec1, vec2, vec3, filt0;
+  __m128i out0, out1, tmp0, tmp1, tmp2, tmp3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+            src, src_stride4, src1, src2, src3, src4);
+
+  DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3, vec0,
+            vec1, vec2, vec3);
+  DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3, filt0,
+            tmp0, tmp1, tmp2, tmp3);
+  DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+            FILTER_BITS, out0, out1);
+
+  __lsx_vstelm_d(out0, dst, 0, 0);
+  __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+  __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
+  __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+}
+
+static void common_vt_2t_8x8mult_lsx(const uint8_t *src, int32_t src_stride,
+                                     uint8_t *dst, int32_t dst_stride,
+                                     int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 3);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  __m128i out0, out1, tmp0, tmp1, tmp2, tmp3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    src5 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src6, src7);
+    src8 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    DUP4_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, src3, src2, src4, src3,
+              vec0, vec1, vec2, vec3);
+    DUP4_ARG2(__lsx_vilvl_b, src5, src4, src6, src5, src7, src6, src8, src7,
+              vec4, vec5, vec6, vec7);
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, vec2, filt0, vec3,
+              filt0, tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, out0, out1);
+
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+    dst += dst_stride4;
+
+    DUP4_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, vec6, filt0, vec7,
+              filt0, tmp0, tmp1, tmp2, tmp3);
+    DUP2_ARG3(__lsx_vssrarni_bu_h, tmp1, tmp0, FILTER_BITS, tmp3, tmp2,
+              FILTER_BITS, out0, out1);
+
+    __lsx_vstelm_d(out0, dst, 0, 0);
+    __lsx_vstelm_d(out0, dst + dst_stride, 0, 1);
+    __lsx_vstelm_d(out1, dst + dst_stride2, 0, 0);
+    __lsx_vstelm_d(out1, dst + dst_stride3, 0, 1);
+    dst += dst_stride4;
+
+    src0 = src8;
+  }
+}
+
+static void common_vt_2t_8w_lsx(const uint8_t *src, int32_t src_stride,
+                                uint8_t *dst, int32_t dst_stride,
+                                int8_t *filter, int32_t height) {
+  if (height == 4) {
+    common_vt_2t_8x4_lsx(src, src_stride, dst, dst_stride, filter);
+  } else {
+    common_vt_2t_8x8mult_lsx(src, src_stride, dst, dst_stride, filter, height);
+  }
+}
+
+static void common_vt_2t_16w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, tmp, tmp0, tmp1;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  src0 = __lsx_vld(src, 0);
+  src += src_stride;
+
+  for (; loop_cnt--;) {
+    src1 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src2, src3);
+    src4 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+    dst += dst_stride;
+
+    src0 = src4;
+  }
+}
+
+static void common_vt_2t_32w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 2);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
+  __m128i vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  __m128i tmp, tmp0, tmp1;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  const uint8_t *src_tmp;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  DUP2_ARG2(__lsx_vld, src, 0, src, 16, src0, src5);
+  src += src_stride;
+  src_tmp = src + 16;
+
+  for (; loop_cnt--;) {
+    DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src1, src6);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
+              src_stride2, src_tmp, src_stride2, src2, src7, src3, src8);
+    DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src4, src9);
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    src += src_stride4;
+    src_tmp += src_stride4;
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vstx(tmp, dst, dst_stride);
+
+    DUP2_ARG2(__lsx_vilvl_b, src3, src2, src4, src3, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src3, src2, src4, src3, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vstx(tmp, dst, dst_stride2);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vstx(tmp, dst, dst_stride3);
+
+    DUP2_ARG2(__lsx_vilvl_b, src6, src5, src7, src6, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src6, src5, src7, src6, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 16);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    dst += dst_stride;
+    __lsx_vst(tmp, dst, 16);
+
+    DUP2_ARG2(__lsx_vilvl_b, src8, src7, src9, src8, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src8, src7, src9, src8, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    dst += dst_stride;
+    __lsx_vst(tmp, dst, 16);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    dst += dst_stride;
+    __lsx_vst(tmp, dst, 16);
+
+    dst += dst_stride;
+
+    src0 = src4;
+    src5 = src9;
+  }
+}
+
+static void common_vt_2t_64w_lsx(const uint8_t *src, int32_t src_stride,
+                                 uint8_t *dst, int32_t dst_stride,
+                                 int8_t *filter, int32_t height) {
+  uint32_t loop_cnt = (height >> 1);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
+  __m128i src11, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, filt0;
+  __m128i tmp, tmp0, tmp1;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t dst_stride2 = dst_stride << 1;
+  uint8_t *dst_tmp1 = dst + dst_stride;
+
+  filt0 = __lsx_vldrepl_h(filter, 0);
+
+  DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src3, src6,
+            src9);
+  src += src_stride;
+
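+  /* Two output rows per iteration, each handled as four 16-byte column
+   * vectors; src_tmp0 and dst_tmp1 track the second row. */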
+  for (; loop_cnt--;) {
+    const uint8_t *src_tmp0 = src + src_stride;
+
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src1, src4, src7,
+              src10);
+    DUP4_ARG2(__lsx_vld, src_tmp0, 0, src_tmp0, 16, src_tmp0, 32, src_tmp0, 48,
+              src2, src5, src8, src11);
+    src += src_stride2;
+
+    DUP2_ARG2(__lsx_vilvl_b, src1, src0, src2, src1, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src1, src0, src2, src1, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 0);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst_tmp1, 0);
+
+    DUP2_ARG2(__lsx_vilvl_b, src4, src3, src5, src4, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src4, src3, src5, src4, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 16);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst_tmp1, 16);
+
+    DUP2_ARG2(__lsx_vilvl_b, src7, src6, src8, src7, vec0, vec2);
+    DUP2_ARG2(__lsx_vilvh_b, src7, src6, src8, src7, vec1, vec3);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec0, filt0, vec1, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 32);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec2, filt0, vec3, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst_tmp1, 32);
+
+    DUP2_ARG2(__lsx_vilvl_b, src10, src9, src11, src10, vec4, vec6);
+    DUP2_ARG2(__lsx_vilvh_b, src10, src9, src11, src10, vec5, vec7);
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec4, filt0, vec5, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst, 48);
+
+    DUP2_ARG2(__lsx_vdp2_h_bu, vec6, filt0, vec7, filt0, tmp0, tmp1);
+    tmp = __lsx_vssrarni_bu_h(tmp1, tmp0, FILTER_BITS);
+    __lsx_vst(tmp, dst_tmp1, 48);
+    dst += dst_stride2;
+    dst_tmp1 += dst_stride2;
+
+    src0 = src2;
+    src3 = src5;
+    src6 = src8;
+    src9 = src11;
+  }
+}
+
+void vpx_convolve8_vert_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                            uint8_t *dst, ptrdiff_t dst_stride,
+                            const InterpKernel *filter, int x0_q4,
+                            int32_t x_step_q4, int y0_q4, int y_step_q4, int w,
+                            int h) {
+  const int16_t *const filter_y = filter[y0_q4];
+  int8_t cnt, filt_ver[8];
+
+  assert(y_step_q4 == 16);
+  assert(((const int32_t *)filter_y)[1] != 0x800000);
+
+  for (cnt = 8; cnt--;) {
+    filt_ver[cnt] = filter_y[cnt];
+  }
+
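+  /* 2-tap kernels take the bilinear path (center taps at &filt_ver[3]);
+   * longer kernels use the 8-tap path. */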
+  if (vpx_get_filter_taps(filter_y) == 2) {
+    switch (w) {
+      case 4:
+        common_vt_2t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            &filt_ver[3], h);
+        break;
+      case 8:
+        common_vt_2t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            &filt_ver[3], h);
+        break;
+      case 16:
+        common_vt_2t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             &filt_ver[3], h);
+        break;
+      case 32:
+        common_vt_2t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             &filt_ver[3], h);
+        break;
+      case 64:
+        common_vt_2t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             &filt_ver[3], h);
+        break;
+      default:
+        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                             x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  } else {
+    switch (w) {
+      case 4:
+        common_vt_8t_4w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            filt_ver, h);
+        break;
+      case 8:
+        common_vt_8t_8w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                            filt_ver, h);
+        break;
+      case 16:
+        common_vt_8t_16w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             filt_ver, h);
+        break;
+      case 32:
+        common_vt_8t_32w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             filt_ver, h);
+        break;
+      case 64:
+        common_vt_8t_64w_lsx(src, (int32_t)src_stride, dst, (int32_t)dst_stride,
+                             filt_ver, h);
+        break;
+      default:
+        vpx_convolve8_vert_c(src, src_stride, dst, dst_stride, filter, x0_q4,
+                             x_step_q4, y0_q4, y_step_q4, w, h);
+        break;
+    }
+  }
+}
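
Note on the dispatch above: the 2-tap branch passes &filt_ver[3], i.e. the two
centre taps of a bilinear kernel, to the common_vt_2t_* helpers. A rough scalar
equivalent of what those helpers compute per pixel, as a sketch for orientation
only (not part of the patch; FILTER_BITS is 7 in vpx_dsp/vpx_filter.h, and the
int16 saturation of the vector path is omitted since plain int cannot overflow
here):

#include <stdint.h>

static void vert_2t_ref(const uint8_t *src, int src_stride, uint8_t *dst,
                        int dst_stride, const int8_t *filter, int w, int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x) {
      /* __lsx_vdp2_h_bu + __lsx_vssrarni_bu_h: two-tap dot product, then a
       * rounding shift by FILTER_BITS with saturation to uint8. */
      int sum = src[x] * filter[0] + src[x + src_stride] * filter[1];
      sum = (sum + 64) >> 7;
      dst[x] = (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
    }
    src += src_stride;
    dst += dst_stride;
  }
}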
diff --git a/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c b/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c
new file mode 100644
index 0000000..1dad29e
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/vpx_convolve_avg_lsx.c
@@ -0,0 +1,320 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static void avg_width4_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+                           int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  __m128i src0, src1;
+  __m128i dst0, dst1;
+
+  int32_t src_stride2 = src_stride << 1;
+
+  if ((height % 2) == 0) {
+    for (cnt = (height / 2); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      src1 = __lsx_vldx(src, src_stride);
+      src += src_stride2;
+
+      dst0 = __lsx_vld(dst, 0);
+      dst1 = __lsx_vldx(dst, dst_stride);
+      DUP2_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, dst0, dst1);
+
+      __lsx_vstelm_w(dst0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_w(dst1, dst, 0, 0);
+      dst += dst_stride;
+    }
+  }
+}
+
+static void avg_width8_lsx(const uint8_t *src, int32_t src_stride, uint8_t *dst,
+                           int32_t dst_stride, int32_t height) {
+  int32_t cnt = (height / 4);
+  __m128i src0, src1, src2, src3;
+  __m128i dst0, dst1, dst2, dst3;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+
+  for (; cnt--;) {
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    dst0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+    dst3 = __lsx_vldx(dst, dst_stride3);
+
+    DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+              dst0, dst1, dst2, dst3);
+
+    __lsx_vstelm_d(dst0, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst1, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst2, dst, 0, 0);
+    dst += dst_stride;
+    __lsx_vstelm_d(dst3, dst, 0, 0);
+    dst += dst_stride;
+  }
+}
+
+static void avg_width16_lsx(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt = (height / 8);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  for (; cnt--;) {
+    src0 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+    src3 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+    src4 = __lsx_vld(src, 0);
+    DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+    src7 = __lsx_vldx(src, src_stride3);
+    src += src_stride4;
+
+    dst0 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst1, dst2);
+    dst3 = __lsx_vldx(dst, dst_stride3);
+    dst += dst_stride4;
+    dst4 = __lsx_vld(dst, 0);
+    DUP2_ARG2(__lsx_vldx, dst, dst_stride, dst, dst_stride2, dst5, dst6);
+    dst7 = __lsx_vldx(dst, dst_stride3);
+    dst -= dst_stride4;
+
+    DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+              dst0, dst1, dst2, dst3);
+    DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+              dst4, dst5, dst6, dst7);
+
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vstx(dst1, dst, dst_stride);
+    __lsx_vstx(dst2, dst, dst_stride2);
+    __lsx_vstx(dst3, dst, dst_stride3);
+    dst += dst_stride4;
+    __lsx_vst(dst4, dst, 0);
+    __lsx_vstx(dst5, dst, dst_stride);
+    __lsx_vstx(dst6, dst, dst_stride2);
+    __lsx_vstx(dst7, dst, dst_stride3);
+    dst += dst_stride4;
+  }
+}
+
+static void avg_width32_lsx(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt = (height / 8);
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i src8, src9, src10, src11, src12, src13, src14, src15;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  int32_t dst_stride2 = dst_stride << 1;
+  int32_t dst_stride3 = dst_stride2 + dst_stride;
+  int32_t dst_stride4 = dst_stride2 << 1;
+
+  for (; cnt--;) {
+    uint8_t *dst_tmp = dst;
+    uint8_t *dst_tmp1 = dst_tmp + 16;
+    uint8_t *src_tmp = (uint8_t *)src + 16;
+
+    DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src0, src1);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
+              src_stride2, src_tmp, src_stride2, src2, src3, src4, src5);
+    DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src6, src7);
+    src += src_stride4;
+
+    DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp1, 0, dst0, dst1);
+    DUP4_ARG2(__lsx_vldx, dst_tmp, dst_stride, dst_tmp1, dst_stride, dst_tmp,
+              dst_stride2, dst_tmp1, dst_stride2, dst2, dst3, dst4, dst5);
+    DUP2_ARG2(__lsx_vldx, dst_tmp, dst_stride3, dst_tmp1, dst_stride3, dst6,
+              dst7);
+    dst_tmp += dst_stride4;
+    dst_tmp1 += dst_stride4;
+
+    src_tmp = (uint8_t *)src + 16;
+    DUP2_ARG2(__lsx_vld, src, 0, src_tmp, 0, src8, src9);
+    DUP4_ARG2(__lsx_vldx, src, src_stride, src_tmp, src_stride, src,
+              src_stride2, src_tmp, src_stride2, src10, src11, src12, src13);
+    DUP2_ARG2(__lsx_vldx, src, src_stride3, src_tmp, src_stride3, src14, src15);
+    src += src_stride4;
+
+    DUP2_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp1, 0, dst8, dst9);
+    DUP4_ARG2(__lsx_vldx, dst_tmp, dst_stride, dst_tmp1, dst_stride, dst_tmp,
+              dst_stride2, dst_tmp1, dst_stride2, dst10, dst11, dst12, dst13);
+    DUP2_ARG2(__lsx_vldx, dst_tmp, dst_stride3, dst_tmp1, dst_stride3, dst14,
+              dst15);
+    DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+              dst0, dst1, dst2, dst3);
+    DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+              dst4, dst5, dst6, dst7);
+    DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10, dst10, src11,
+              dst11, dst8, dst9, dst10, dst11);
+    DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14, dst14, src15,
+              dst15, dst12, dst13, dst14, dst15);
+
+    dst_tmp = dst + 16;
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vstx(dst2, dst, dst_stride);
+    __lsx_vstx(dst4, dst, dst_stride2);
+    __lsx_vstx(dst6, dst, dst_stride3);
+    __lsx_vst(dst1, dst_tmp, 0);
+    __lsx_vstx(dst3, dst_tmp, dst_stride);
+    __lsx_vstx(dst5, dst_tmp, dst_stride2);
+    __lsx_vstx(dst7, dst_tmp, dst_stride3);
+    dst += dst_stride4;
+
+    __lsx_vst(dst8, dst, 0);
+    __lsx_vstx(dst10, dst, dst_stride);
+    __lsx_vstx(dst12, dst, dst_stride2);
+    __lsx_vstx(dst14, dst, dst_stride3);
+    __lsx_vst(dst9, dst_tmp1, 0);
+    __lsx_vstx(dst11, dst_tmp1, dst_stride);
+    __lsx_vstx(dst13, dst_tmp1, dst_stride2);
+    __lsx_vstx(dst15, dst_tmp1, dst_stride3);
+    dst += dst_stride4;
+  }
+}
+
+static void avg_width64_lsx(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt = (height / 4);
+  uint8_t *dst_tmp = dst;
+
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  __m128i src8, src9, src10, src11, src12, src13, src14, src15;
+  __m128i dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
+  __m128i dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
+
+  for (; cnt--;) {
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src0, src1, src2,
+              src3);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src4, src5, src6,
+              src7);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src8, src9, src10,
+              src11);
+    src += src_stride;
+    DUP4_ARG2(__lsx_vld, src, 0, src, 16, src, 32, src, 48, src12, src13, src14,
+              src15);
+    src += src_stride;
+
+    DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
+              dst0, dst1, dst2, dst3);
+    dst_tmp += dst_stride;
+    DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
+              dst4, dst5, dst6, dst7);
+    dst_tmp += dst_stride;
+    DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
+              dst8, dst9, dst10, dst11);
+    dst_tmp += dst_stride;
+    DUP4_ARG2(__lsx_vld, dst_tmp, 0, dst_tmp, 16, dst_tmp, 32, dst_tmp, 48,
+              dst12, dst13, dst14, dst15);
+    dst_tmp += dst_stride;
+
+    DUP4_ARG2(__lsx_vavgr_bu, src0, dst0, src1, dst1, src2, dst2, src3, dst3,
+              dst0, dst1, dst2, dst3);
+    DUP4_ARG2(__lsx_vavgr_bu, src4, dst4, src5, dst5, src6, dst6, src7, dst7,
+              dst4, dst5, dst6, dst7);
+    DUP4_ARG2(__lsx_vavgr_bu, src8, dst8, src9, dst9, src10, dst10, src11,
+              dst11, dst8, dst9, dst10, dst11);
+    DUP4_ARG2(__lsx_vavgr_bu, src12, dst12, src13, dst13, src14, dst14, src15,
+              dst15, dst12, dst13, dst14, dst15);
+
+    __lsx_vst(dst0, dst, 0);
+    __lsx_vst(dst1, dst, 16);
+    __lsx_vst(dst2, dst, 32);
+    __lsx_vst(dst3, dst, 48);
+    dst += dst_stride;
+    __lsx_vst(dst4, dst, 0);
+    __lsx_vst(dst5, dst, 16);
+    __lsx_vst(dst6, dst, 32);
+    __lsx_vst(dst7, dst, 48);
+    dst += dst_stride;
+    __lsx_vst(dst8, dst, 0);
+    __lsx_vst(dst9, dst, 16);
+    __lsx_vst(dst10, dst, 32);
+    __lsx_vst(dst11, dst, 48);
+    dst += dst_stride;
+    __lsx_vst(dst12, dst, 0);
+    __lsx_vst(dst13, dst, 16);
+    __lsx_vst(dst14, dst, 32);
+    __lsx_vst(dst15, dst, 48);
+    dst += dst_stride;
+  }
+}
+
+void vpx_convolve_avg_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                          uint8_t *dst, ptrdiff_t dst_stride,
+                          const InterpKernel *filter, int x0_q4,
+                          int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
+                          int32_t w, int32_t h) {
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+  switch (w) {
+    case 4: {
+      avg_width4_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+
+    case 8: {
+      avg_width8_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 16: {
+      avg_width16_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 32: {
+      avg_width32_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 64: {
+      avg_width64_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    default: {
+      int32_t lp, cnt;
+      for (cnt = h; cnt--;) {
+        for (lp = 0; lp < w; ++lp) {
+          dst[lp] = (((dst[lp] + src[lp]) + 1) >> 1);
+        }
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    }
+  }
+}
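
Every avg_widthN helper in this file is the vector form of the rounded average
that the default branch spells out longhand; __lsx_vavgr_bu applies it to 16
unsigned bytes at once. A minimal scalar model, for reference only:

#include <stdint.h>

static uint8_t avgr_bu(uint8_t a, uint8_t b) {
  /* rounded unsigned average, identical to the default branch above */
  return (uint8_t)(((uint32_t)a + b + 1) >> 1);
}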
diff --git a/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c b/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c
new file mode 100644
index 0000000..53dc709
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/vpx_convolve_copy_lsx.c
@@ -0,0 +1,437 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include <string.h>
+#include "./vpx_dsp_rtcd.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static void copy_width8_lsx(const uint8_t *src, int32_t src_stride,
+                            uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  if ((height % 12) == 0) {
+    for (cnt = (height / 12); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+                src, src_stride4, src1, src2, src3, src4);
+      src += src_stride4;
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+      src += src_stride2;
+      src7 = __lsx_vldx(src, src_stride);
+      src += src_stride2;
+
+      __lsx_vstelm_d(src0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src1, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src2, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src3, dst, 0, 0);
+      dst += dst_stride;
+
+      __lsx_vstelm_d(src4, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src5, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src6, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src7, dst, 0, 0);
+      dst += dst_stride;
+
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+      src += src_stride4;
+
+      __lsx_vstelm_d(src0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src1, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src2, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src3, dst, 0, 0);
+      dst += dst_stride;
+    }
+  } else if ((height % 8) == 0) {
+    for (cnt = height >> 3; cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+                src, src_stride4, src1, src2, src3, src4);
+      src += src_stride4;
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+      src += src_stride2;
+      src7 = __lsx_vldx(src, src_stride);
+      src += src_stride2;
+
+      __lsx_vstelm_d(src0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src1, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src2, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src3, dst, 0, 0);
+      dst += dst_stride;
+
+      __lsx_vstelm_d(src4, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src5, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src6, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src7, dst, 0, 0);
+      dst += dst_stride;
+    }
+  } else if ((height % 4) == 0) {
+    for (cnt = (height / 4); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+      src += src_stride4;
+
+      __lsx_vstelm_d(src0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src1, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src2, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src3, dst, 0, 0);
+      dst += dst_stride;
+    }
+  } else if ((height % 2) == 0) {
+    for (cnt = (height / 2); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      src1 = __lsx_vldx(src, src_stride);
+      src += src_stride2;
+
+      __lsx_vstelm_d(src0, dst, 0, 0);
+      dst += dst_stride;
+      __lsx_vstelm_d(src1, dst, 0, 0);
+      dst += dst_stride;
+    }
+  }
+}
+
+static void copy_16multx8mult_lsx(const uint8_t *src, int32_t src_stride,
+                                  uint8_t *dst, int32_t dst_stride,
+                                  int32_t height, int32_t width) {
+  int32_t cnt, loop_cnt;
+  uint8_t *src_tmp;
+  uint8_t *dst_tmp;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  for (cnt = (width >> 4); cnt--;) {
+    src_tmp = (uint8_t *)src;
+    dst_tmp = dst;
+
+    for (loop_cnt = (height >> 3); loop_cnt--;) {
+      src0 = __lsx_vld(src_tmp, 0);
+      DUP4_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src_tmp,
+                src_stride3, src_tmp, src_stride4, src1, src2, src3, src4);
+      src_tmp += src_stride4;
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+                src6);
+      src_tmp += src_stride2;
+      src7 = __lsx_vldx(src_tmp, src_stride);
+      src_tmp += src_stride2;
+
+      __lsx_vst(src0, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src1, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src2, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src3, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src4, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src5, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src6, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src7, dst_tmp, 0);
+      dst_tmp += dst_stride;
+    }
+    src += 16;
+    dst += 16;
+  }
+}
+
+static void copy_width16_lsx(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  if ((height % 12) == 0) {
+    for (cnt = (height / 12); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP4_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src, src_stride3,
+                src, src_stride4, src1, src2, src3, src4);
+      src += src_stride4;
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src5, src6);
+      src += src_stride2;
+      src7 = __lsx_vldx(src, src_stride);
+      src += src_stride2;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src4, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src5, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src6, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src7, dst, 0);
+      dst += dst_stride;
+
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+      src += src_stride4;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+    }
+  } else if ((height % 8) == 0) {
+    copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 16);
+  } else if ((height % 4) == 0) {
+    for (cnt = (height >> 2); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+      src += src_stride4;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+    }
+  }
+}
+
+static void copy_width32_lsx(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  int32_t cnt;
+  uint8_t *src_tmp;
+  uint8_t *dst_tmp;
+  __m128i src0, src1, src2, src3, src4, src5, src6, src7;
+  int32_t src_stride2 = src_stride << 1;
+  int32_t src_stride3 = src_stride2 + src_stride;
+  int32_t src_stride4 = src_stride2 << 1;
+
+  if ((height % 12) == 0) {
+    for (cnt = (height / 12); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+
+      src_tmp = (uint8_t *)src + 16;
+      src4 = __lsx_vld(src_tmp, 0);
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+                src6);
+      src7 = __lsx_vldx(src_tmp, src_stride3);
+      src += src_stride4;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+
+      dst_tmp = dst + 16;
+      __lsx_vst(src4, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src5, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src6, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src7, dst_tmp, 0);
+      dst_tmp += dst_stride;
+
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+
+      src_tmp = (uint8_t *)src + 16;
+      src4 = __lsx_vld(src_tmp, 0);
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+                src6);
+      src7 = __lsx_vldx(src_tmp, src_stride3);
+      src += src_stride4;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+
+      dst_tmp = dst + 16;
+      __lsx_vst(src4, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src5, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src6, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src7, dst_tmp, 0);
+      dst_tmp += dst_stride;
+
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+
+      src_tmp = (uint8_t *)src + 16;
+      src4 = __lsx_vld(src_tmp, 0);
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+                src6);
+      src7 = __lsx_vldx(src_tmp, src_stride3);
+      src += src_stride4;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+
+      dst_tmp = dst + 16;
+      __lsx_vst(src4, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src5, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src6, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src7, dst_tmp, 0);
+      dst_tmp += dst_stride;
+    }
+  } else if ((height % 8) == 0) {
+    copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 32);
+  } else if ((height % 4) == 0) {
+    for (cnt = (height >> 2); cnt--;) {
+      src0 = __lsx_vld(src, 0);
+      DUP2_ARG2(__lsx_vldx, src, src_stride, src, src_stride2, src1, src2);
+      src3 = __lsx_vldx(src, src_stride3);
+
+      src_tmp = (uint8_t *)src + 16;
+      src4 = __lsx_vld(src_tmp, 0);
+      DUP2_ARG2(__lsx_vldx, src_tmp, src_stride, src_tmp, src_stride2, src5,
+                src6);
+      src7 = __lsx_vldx(src_tmp, src_stride3);
+      src += src_stride4;
+
+      __lsx_vst(src0, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src1, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src2, dst, 0);
+      dst += dst_stride;
+      __lsx_vst(src3, dst, 0);
+      dst += dst_stride;
+
+      dst_tmp = dst + 16;
+      __lsx_vst(src4, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src5, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src6, dst_tmp, 0);
+      dst_tmp += dst_stride;
+      __lsx_vst(src7, dst_tmp, 0);
+      dst_tmp += dst_stride;
+    }
+  }
+}
+
+static void copy_width64_lsx(const uint8_t *src, int32_t src_stride,
+                             uint8_t *dst, int32_t dst_stride, int32_t height) {
+  copy_16multx8mult_lsx(src, src_stride, dst, dst_stride, height, 64);
+}
+
+void vpx_convolve_copy_lsx(const uint8_t *src, ptrdiff_t src_stride,
+                           uint8_t *dst, ptrdiff_t dst_stride,
+                           const InterpKernel *filter, int x0_q4,
+                           int32_t x_step_q4, int y0_q4, int32_t y_step_q4,
+                           int32_t w, int32_t h) {
+  (void)filter;
+  (void)x0_q4;
+  (void)x_step_q4;
+  (void)y0_q4;
+  (void)y_step_q4;
+
+  switch (w) {
+    case 4: {
+      uint32_t cnt;
+      __m128i tmp;
+      for (cnt = h; cnt--;) {
+        tmp = __lsx_vldrepl_w(src, 0);
+        __lsx_vstelm_w(tmp, dst, 0, 0);
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    }
+    case 8: {
+      copy_width8_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 16: {
+      copy_width16_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 32: {
+      copy_width32_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    case 64: {
+      copy_width64_lsx(src, src_stride, dst, dst_stride, h);
+      break;
+    }
+    default: {
+      uint32_t cnt;
+      for (cnt = h; cnt--;) {
+        memcpy(dst, src, w);
+        src += src_stride;
+        dst += dst_stride;
+      }
+      break;
+    }
+  }
+}
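
For reference, the w == 4 branch above amounts to copying one 32-bit word per
row (__lsx_vldrepl_w plus __lsx_vstelm_w). A plain-C sketch of the same
operation, not part of the patch:

#include <stdint.h>
#include <string.h>

static void copy_w4_ref(const uint8_t *src, int src_stride, uint8_t *dst,
                        int dst_stride, int h) {
  while (h--) {
    memcpy(dst, src, 4); /* one 4-byte row per iteration */
    src += src_stride;
    dst += dst_stride;
  }
}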
diff --git a/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h b/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h
new file mode 100644
index 0000000..d886b00
--- /dev/null
+++ b/libvpx/vpx_dsp/loongarch/vpx_convolve_lsx.h
@@ -0,0 +1,138 @@
+/*
+ *  Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ *  Use of this source code is governed by a BSD-style license
+ *  that can be found in the LICENSE file in the root of the source
+ *  tree. An additional intellectual property rights grant can be found
+ *  in the file PATENTS.  All contributing project authors may
+ *  be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_
+#define VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_
+
+#include "./vpx_config.h"
+#include "vpx_dsp/vpx_filter.h"
+#include "vpx_util/loongson_intrinsics.h"
+
+static INLINE __m128i filt_8tap_dpadd_s_h(__m128i _reg0, __m128i _reg1,
+                                          __m128i _reg2, __m128i _reg3,
+                                          __m128i _filter0, __m128i _filter1,
+                                          __m128i _filter2, __m128i _filter3) {
+  __m128i _vec0, _vec1;
+
+  _vec0 = __lsx_vdp2_h_b(_reg0, _filter0);
+  _vec0 = __lsx_vdp2add_h_b(_vec0, _reg1, _filter1);
+  _vec1 = __lsx_vdp2_h_b(_reg2, _filter2);
+  _vec1 = __lsx_vdp2add_h_b(_vec1, _reg3, _filter3);
+  return __lsx_vsadd_h(_vec0, _vec1);
+}
+
+static INLINE __m128i horiz_8tap_filt(__m128i _src0, __m128i _src1,
+                                      __m128i _mask0, __m128i _mask1,
+                                      __m128i _mask2, __m128i _mask3,
+                                      __m128i _filt_h0, __m128i _filt_h1,
+                                      __m128i _filt_h2, __m128i _filt_h3) {
+  __m128i _tmp0, _tmp1, _tmp2, _tmp3;
+  __m128i _out;
+
+  DUP4_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src1, _src0, _mask1, _src1,
+            _src0, _mask2, _src1, _src0, _mask3, _tmp0, _tmp1, _tmp2, _tmp3);
+  _out = filt_8tap_dpadd_s_h(_tmp0, _tmp1, _tmp2, _tmp3, _filt_h0, _filt_h1,
+                             _filt_h2, _filt_h3);
+  _out = __lsx_vsrari_h(_out, FILTER_BITS);
+  return __lsx_vsat_h(_out, 7);
+}
+
+static INLINE __m128i horiz_2tap_filt_uh(__m128i in0, __m128i in1, __m128i mask,
+                                         __m128i coeff) {
+  __m128i tmp0_m, tmp1_m;
+
+  tmp0_m = __lsx_vshuf_b(in1, in0, mask);
+  tmp1_m = __lsx_vdp2_h_bu(tmp0_m, coeff);
+  return __lsx_vsrari_h(tmp1_m, FILTER_BITS);
+}
+
+#define LSX_LD_4(_src, _stride, _src0, _src1, _src2, _src3) \
+  do {                                                      \
+    _src0 = __lsx_vld(_src, 0);                             \
+    _src += _stride;                                        \
+    _src1 = __lsx_vld(_src, 0);                             \
+    _src += _stride;                                        \
+    _src2 = __lsx_vld(_src, 0);                             \
+    _src += _stride;                                        \
+    _src3 = __lsx_vld(_src, 0);                             \
+  } while (0)
+
+#define HORIZ_8TAP_4WID_4VECS_FILT(_src0, _src1, _src2, _src3, _mask0, _mask1, \
+                                   _mask2, _mask3, _filter0, _filter1,         \
+                                   _filter2, _filter3, _out0, _out1)           \
+  do {                                                                         \
+    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;            \
+    __m128i _reg0, _reg1, _reg2, _reg3;                                        \
+                                                                               \
+    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask0, _src3, _src2, _mask0,       \
+              _tmp0, _tmp1);                                                   \
+    DUP2_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _reg0, _reg1); \
+    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask1, _src3, _src2, _mask1,       \
+              _tmp2, _tmp3);                                                   \
+    DUP2_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp2, _filter1, _reg1, _tmp3,         \
+              _filter1, _reg0, _reg1);                                         \
+    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask2, _src3, _src2, _mask2,       \
+              _tmp4, _tmp5);                                                   \
+    DUP2_ARG2(__lsx_vdp2_h_b, _tmp4, _filter2, _tmp5, _filter2, _reg2, _reg3); \
+    DUP2_ARG3(__lsx_vshuf_b, _src1, _src0, _mask3, _src3, _src2, _mask3,       \
+              _tmp6, _tmp7);                                                   \
+    DUP2_ARG3(__lsx_vdp2add_h_b, _reg2, _tmp6, _filter3, _reg3, _tmp7,         \
+              _filter3, _reg2, _reg3);                                         \
+    DUP2_ARG2(__lsx_vsadd_h, _reg0, _reg2, _reg1, _reg3, _out0, _out1);        \
+  } while (0)
+
+#define HORIZ_8TAP_8WID_4VECS_FILT(                                            \
+    _src0, _src1, _src2, _src3, _mask0, _mask1, _mask2, _mask3, _filter0,      \
+    _filter1, _filter2, _filter3, _out0, _out1, _out2, _out3)                  \
+  do {                                                                         \
+    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;            \
+    __m128i _reg0, _reg1, _reg2, _reg3, _reg4, _reg5, _reg6, _reg7;            \
+                                                                               \
+    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask0, _src1, _src1, _mask0,       \
+              _src2, _src2, _mask0, _src3, _src3, _mask0, _tmp0, _tmp1, _tmp2, \
+              _tmp3);                                                          \
+    DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter0, _tmp1, _filter0, _tmp2,         \
+              _filter0, _tmp3, _filter0, _reg0, _reg1, _reg2, _reg3);          \
+    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask2, _src1, _src1, _mask2,       \
+              _src2, _src2, _mask2, _src3, _src3, _mask2, _tmp0, _tmp1, _tmp2, \
+              _tmp3);                                                          \
+    DUP4_ARG2(__lsx_vdp2_h_b, _tmp0, _filter2, _tmp1, _filter2, _tmp2,         \
+              _filter2, _tmp3, _filter2, _reg4, _reg5, _reg6, _reg7);          \
+    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask1, _src1, _src1, _mask1,       \
+              _src2, _src2, _mask1, _src3, _src3, _mask1, _tmp4, _tmp5, _tmp6, \
+              _tmp7);                                                          \
+    DUP4_ARG3(__lsx_vdp2add_h_b, _reg0, _tmp4, _filter1, _reg1, _tmp5,         \
+              _filter1, _reg2, _tmp6, _filter1, _reg3, _tmp7, _filter1, _reg0, \
+              _reg1, _reg2, _reg3);                                            \
+    DUP4_ARG3(__lsx_vshuf_b, _src0, _src0, _mask3, _src1, _src1, _mask3,       \
+              _src2, _src2, _mask3, _src3, _src3, _mask3, _tmp4, _tmp5, _tmp6, \
+              _tmp7);                                                          \
+    DUP4_ARG3(__lsx_vdp2add_h_b, _reg4, _tmp4, _filter3, _reg5, _tmp5,         \
+              _filter3, _reg6, _tmp6, _filter3, _reg7, _tmp7, _filter3, _reg4, \
+              _reg5, _reg6, _reg7);                                            \
+    DUP4_ARG2(__lsx_vsadd_h, _reg0, _reg4, _reg1, _reg5, _reg2, _reg6, _reg3,  \
+              _reg7, _out0, _out1, _out2, _out3);                              \
+  } while (0)
+
+#define AVG_ST4_D(in0, in1, dst0, dst1, pdst, stride)                \
+  do {                                                               \
+    __m128i tmp0_m, tmp1_m;                                          \
+                                                                     \
+    DUP2_ARG2(__lsx_vavgr_bu, in0, dst0, in1, dst1, tmp0_m, tmp1_m); \
+    __lsx_vstelm_d(tmp0_m, pdst, 0, 0);                              \
+    pdst += stride;                                                  \
+    __lsx_vstelm_d(tmp0_m, pdst, 0, 1);                              \
+    pdst += stride;                                                  \
+    __lsx_vstelm_d(tmp1_m, pdst, 0, 0);                              \
+    pdst += stride;                                                  \
+    __lsx_vstelm_d(tmp1_m, pdst, 0, 1);                              \
+  } while (0)
+
+#endif  // VPX_VPX_DSP_LOONGARCH_VPX_CONVOLVE_LSX_H_
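
filt_8tap_dpadd_s_h above splits the eight taps into four two-tap dot products
and combines them with saturating adds; horiz_8tap_filt then applies a rounding
shift by FILTER_BITS and saturates to signed 8 bits. A scalar sketch of that
pipeline, for illustration only (FILTER_BITS is 7; the intermediate
__lsx_vsadd_h saturation is omitted since a plain int cannot overflow here):

#include <stdint.h>

static int16_t filt_8tap_ref(const uint8_t *src, const int8_t *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += src[k] * filter[k]; /* the four vdp2 pairs */
  sum = (sum + 64) >> 7;      /* __lsx_vsrari_h(_out, FILTER_BITS) */
  if (sum < -128) sum = -128; /* __lsx_vsat_h(_out, 7) */
  if (sum > 127) sum = 127;
  return (int16_t)sum;
}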
diff --git a/libvpx/vpx_dsp/mips/sad_mmi.c b/libvpx/vpx_dsp/mips/sad_mmi.c
index eaca477..7f5882b 100644
--- a/libvpx/vpx_dsp/mips/sad_mmi.c
+++ b/libvpx/vpx_dsp/mips/sad_mmi.c
@@ -334,19 +334,6 @@
   "paddw      %[ftmp3],   %[ftmp3],       %[ftmp1]            \n\t"
 #endif /* _MIPS_SIM == _ABIO32 */
 
-// depending on call sites, pass **ref_array to avoid & in subsequent call and
-// de-dup with 4D below.
-#define sadMxNxK_mmi(m, n, k)                                                 \
-  void vpx_sad##m##x##n##x##k##_mmi(const uint8_t *src, int src_stride,       \
-                                    const uint8_t *ref_array, int ref_stride, \
-                                    uint32_t *sad_array) {                    \
-    int i;                                                                    \
-    for (i = 0; i < (k); ++i)                                                 \
-      sad_array[i] =                                                          \
-          vpx_sad##m##x##n##_mmi(src, src_stride, &ref_array[i], ref_stride); \
-  }
-
-// This appears to be equivalent to the above when k == 4 and refs is const
 #define sadMxNx4D_mmi(m, n)                                                  \
   void vpx_sad##m##x##n##x4d_mmi(const uint8_t *src, int src_stride,         \
                                  const uint8_t *const ref_array[],           \
@@ -583,10 +570,6 @@
 vpx_sad16xN(32);
 vpx_sad16xN(16);
 vpx_sad16xN(8);
-sadMxNxK_mmi(16, 16, 3);
-sadMxNxK_mmi(16, 16, 8);
-sadMxNxK_mmi(16, 8, 3);
-sadMxNxK_mmi(16, 8, 8);
 sadMxNx4D_mmi(16, 32);
 sadMxNx4D_mmi(16, 16);
 sadMxNx4D_mmi(16, 8);
@@ -681,10 +664,6 @@
 vpx_sad8xN(16);
 vpx_sad8xN(8);
 vpx_sad8xN(4);
-sadMxNxK_mmi(8, 16, 3);
-sadMxNxK_mmi(8, 16, 8);
-sadMxNxK_mmi(8, 8, 3);
-sadMxNxK_mmi(8, 8, 8);
 sadMxNx4D_mmi(8, 16);
 sadMxNx4D_mmi(8, 8);
 sadMxNx4D_mmi(8, 4);
@@ -777,8 +756,6 @@
 
 vpx_sad4xN(8);
 vpx_sad4xN(4);
-sadMxNxK_mmi(4, 4, 3);
-sadMxNxK_mmi(4, 4, 8);
 sadMxNx4D_mmi(4, 8);
 sadMxNx4D_mmi(4, 4);
 
diff --git a/libvpx/vpx_dsp/mips/sad_msa.c b/libvpx/vpx_dsp/mips/sad_msa.c
index ab681ae..b0f8ff1 100644
--- a/libvpx/vpx_dsp/mips/sad_msa.c
+++ b/libvpx/vpx_dsp/mips/sad_msa.c
@@ -159,380 +159,6 @@
   return sad;
 }
 
-static void sad_4width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
-                              const uint8_t *ref_ptr, int32_t ref_stride,
-                              int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  uint32_t src0, src1, src2, src3;
-  v16u8 src = { 0 };
-  v16u8 ref = { 0 };
-  v16u8 ref0, ref1, ref2, ref3, diff;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LW4(src_ptr, src_stride, src0, src1, src2, src3);
-    src_ptr += (4 * src_stride);
-    INSERT_W4_UB(src0, src1, src2, src3, src);
-
-    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_8width_x3_msa(const uint8_t *src, int32_t src_stride,
-                              const uint8_t *ref, int32_t ref_stride,
-                              int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
-    ref += (4 * ref_stride);
-    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
-                ref0, ref1);
-    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_16width_x3_msa(const uint8_t *src_ptr, int32_t src_stride,
-                               const uint8_t *ref_ptr, int32_t ref_stride,
-                               int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src, ref, ref0, ref1, diff;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-
-  for (ht_cnt = (height >> 1); ht_cnt--;) {
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-
-    diff = __msa_asub_u_b(src, ref0);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-
-    diff = __msa_asub_u_b(src, ref0);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-}
-
-static void sad_4width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
-                              const uint8_t *ref_ptr, int32_t ref_stride,
-                              int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  uint32_t src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref2, ref3, diff;
-  v16u8 src = { 0 };
-  v16u8 ref = { 0 };
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-  v8u16 sad3 = { 0 };
-  v8u16 sad4 = { 0 };
-  v8u16 sad5 = { 0 };
-  v8u16 sad6 = { 0 };
-  v8u16 sad7 = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LW4(src_ptr, src_stride, src0, src1, src2, src3);
-    INSERT_W4_UB(src0, src1, src2, src3, src);
-    src_ptr += (4 * src_stride);
-    LD_UB4(ref_ptr, ref_stride, ref0, ref1, ref2, ref3);
-    ref_ptr += (4 * ref_stride);
-
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad3 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad4 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad5 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad6 += __msa_hadd_u_h(diff, diff);
-
-    SLDI_B2_UB(ref0, ref1, ref0, ref1, ref0, ref1, 1);
-    SLDI_B2_UB(ref2, ref3, ref2, ref3, ref2, ref3, 1);
-    SAD_INSVE_W4_UB(ref0, ref1, ref2, ref3, ref);
-    diff = __msa_asub_u_b(src, ref);
-    sad7 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-  sad_array[3] = HADD_UH_U32(sad3);
-  sad_array[4] = HADD_UH_U32(sad4);
-  sad_array[5] = HADD_UH_U32(sad5);
-  sad_array[6] = HADD_UH_U32(sad6);
-  sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_8width_x8_msa(const uint8_t *src, int32_t src_stride,
-                              const uint8_t *ref, int32_t ref_stride,
-                              int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src0, src1, src2, src3;
-  v16u8 ref0, ref1, ref00, ref11, ref22, ref33;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-  v8u16 sad3 = { 0 };
-  v8u16 sad4 = { 0 };
-  v8u16 sad5 = { 0 };
-  v8u16 sad6 = { 0 };
-  v8u16 sad7 = { 0 };
-
-  for (ht_cnt = (height >> 2); ht_cnt--;) {
-    LD_UB4(src, src_stride, src0, src1, src2, src3);
-    src += (4 * src_stride);
-    LD_UB4(ref, ref_stride, ref00, ref11, ref22, ref33);
-    ref += (4 * ref_stride);
-    PCKEV_D4_UB(src1, src0, src3, src2, ref11, ref00, ref33, ref22, src0, src1,
-                ref0, ref1);
-    sad0 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad1 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad2 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad3 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad4 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad5 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad6 += SAD_UB2_UH(src0, src1, ref0, ref1);
-
-    SLDI_B2_UB(ref00, ref11, ref00, ref11, ref00, ref11, 1);
-    SLDI_B2_UB(ref22, ref33, ref22, ref33, ref22, ref33, 1);
-    PCKEV_D2_UB(ref11, ref00, ref33, ref22, ref0, ref1);
-    sad7 += SAD_UB2_UH(src0, src1, ref0, ref1);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-  sad_array[3] = HADD_UH_U32(sad3);
-  sad_array[4] = HADD_UH_U32(sad4);
-  sad_array[5] = HADD_UH_U32(sad5);
-  sad_array[6] = HADD_UH_U32(sad6);
-  sad_array[7] = HADD_UH_U32(sad7);
-}
-
-static void sad_16width_x8_msa(const uint8_t *src_ptr, int32_t src_stride,
-                               const uint8_t *ref_ptr, int32_t ref_stride,
-                               int32_t height, uint32_t *sad_array) {
-  int32_t ht_cnt;
-  v16u8 src, ref0, ref1, ref;
-  v16u8 diff;
-  v8u16 sad0 = { 0 };
-  v8u16 sad1 = { 0 };
-  v8u16 sad2 = { 0 };
-  v8u16 sad3 = { 0 };
-  v8u16 sad4 = { 0 };
-  v8u16 sad5 = { 0 };
-  v8u16 sad6 = { 0 };
-  v8u16 sad7 = { 0 };
-
-  for (ht_cnt = (height >> 1); ht_cnt--;) {
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-
-    diff = __msa_asub_u_b(src, ref0);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
-    diff = __msa_asub_u_b(src, ref);
-    sad3 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
-    diff = __msa_asub_u_b(src, ref);
-    sad4 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
-    diff = __msa_asub_u_b(src, ref);
-    sad5 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
-    diff = __msa_asub_u_b(src, ref);
-    sad6 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
-    diff = __msa_asub_u_b(src, ref);
-    sad7 += __msa_hadd_u_h(diff, diff);
-
-    src = LD_UB(src_ptr);
-    src_ptr += src_stride;
-    LD_UB2(ref_ptr, 16, ref0, ref1);
-    ref_ptr += ref_stride;
-
-    diff = __msa_asub_u_b(src, ref0);
-    sad0 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 1);
-    diff = __msa_asub_u_b(src, ref);
-    sad1 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 2);
-    diff = __msa_asub_u_b(src, ref);
-    sad2 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 3);
-    diff = __msa_asub_u_b(src, ref);
-    sad3 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 4);
-    diff = __msa_asub_u_b(src, ref);
-    sad4 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 5);
-    diff = __msa_asub_u_b(src, ref);
-    sad5 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 6);
-    diff = __msa_asub_u_b(src, ref);
-    sad6 += __msa_hadd_u_h(diff, diff);
-
-    ref = (v16u8)__msa_sldi_b((v16i8)ref1, (v16i8)ref0, 7);
-    diff = __msa_asub_u_b(src, ref);
-    sad7 += __msa_hadd_u_h(diff, diff);
-  }
-
-  sad_array[0] = HADD_UH_U32(sad0);
-  sad_array[1] = HADD_UH_U32(sad1);
-  sad_array[2] = HADD_UH_U32(sad2);
-  sad_array[3] = HADD_UH_U32(sad3);
-  sad_array[4] = HADD_UH_U32(sad4);
-  sad_array[5] = HADD_UH_U32(sad5);
-  sad_array[6] = HADD_UH_U32(sad6);
-  sad_array[7] = HADD_UH_U32(sad7);
-}
-
 static void sad_4width_x4d_msa(const uint8_t *src_ptr, int32_t src_stride,
                                const uint8_t *const aref_ptr[],
                                int32_t ref_stride, int32_t height,
@@ -1037,80 +663,38 @@
     return sad_64width_msa(src, src_stride, ref, ref_stride, height);         \
   }
 
-#define VPX_SAD_4xHEIGHTx3_MSA(height)                                   \
-  void vpx_sad4x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
-                                 const uint8_t *ref, int32_t ref_stride, \
-                                 uint32_t *sads) {                       \
-    sad_4width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define VPX_SAD_8xHEIGHTx3_MSA(height)                                   \
-  void vpx_sad8x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
-                                 const uint8_t *ref, int32_t ref_stride, \
-                                 uint32_t *sads) {                       \
-    sad_8width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define VPX_SAD_16xHEIGHTx3_MSA(height)                                   \
-  void vpx_sad16x##height##x3_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *ref, int32_t ref_stride, \
-                                  uint32_t *sads) {                       \
-    sad_16width_x3_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define VPX_SAD_4xHEIGHTx8_MSA(height)                                   \
-  void vpx_sad4x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
-                                 const uint8_t *ref, int32_t ref_stride, \
-                                 uint32_t *sads) {                       \
-    sad_4width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define VPX_SAD_8xHEIGHTx8_MSA(height)                                   \
-  void vpx_sad8x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
-                                 const uint8_t *ref, int32_t ref_stride, \
-                                 uint32_t *sads) {                       \
-    sad_8width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
-#define VPX_SAD_16xHEIGHTx8_MSA(height)                                   \
-  void vpx_sad16x##height##x8_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *ref, int32_t ref_stride, \
-                                  uint32_t *sads) {                       \
-    sad_16width_x8_msa(src, src_stride, ref, ref_stride, height, sads);   \
-  }
-
 #define VPX_SAD_4xHEIGHTx4D_MSA(height)                                   \
   void vpx_sad4x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *const refs[],            \
-                                  int32_t ref_stride, uint32_t *sads) {   \
+                                  const uint8_t *const refs[4],           \
+                                  int32_t ref_stride, uint32_t sads[4]) { \
     sad_4width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
   }
 
 #define VPX_SAD_8xHEIGHTx4D_MSA(height)                                   \
   void vpx_sad8x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                  const uint8_t *const refs[],            \
-                                  int32_t ref_stride, uint32_t *sads) {   \
+                                  const uint8_t *const refs[4],           \
+                                  int32_t ref_stride, uint32_t sads[4]) { \
     sad_8width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
   }
 
 #define VPX_SAD_16xHEIGHTx4D_MSA(height)                                   \
   void vpx_sad16x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                   const uint8_t *const refs[],            \
-                                   int32_t ref_stride, uint32_t *sads) {   \
+                                   const uint8_t *const refs[4],           \
+                                   int32_t ref_stride, uint32_t sads[4]) { \
     sad_16width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
   }
 
 #define VPX_SAD_32xHEIGHTx4D_MSA(height)                                   \
   void vpx_sad32x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                   const uint8_t *const refs[],            \
-                                   int32_t ref_stride, uint32_t *sads) {   \
+                                   const uint8_t *const refs[4],           \
+                                   int32_t ref_stride, uint32_t sads[4]) { \
     sad_32width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
   }
 
 #define VPX_SAD_64xHEIGHTx4D_MSA(height)                                   \
   void vpx_sad64x##height##x4d_msa(const uint8_t *src, int32_t src_stride, \
-                                   const uint8_t *const refs[],            \
-                                   int32_t ref_stride, uint32_t *sads) {   \
+                                   const uint8_t *const refs[4],           \
+                                   int32_t ref_stride, uint32_t sads[4]) { \
     sad_64width_x4d_msa(src, src_stride, refs, ref_stride, height, sads);  \
   }
 
@@ -1186,29 +770,21 @@
 
 // 16x16
 VPX_SAD_16xHEIGHT_MSA(16);
-VPX_SAD_16xHEIGHTx3_MSA(16);
-VPX_SAD_16xHEIGHTx8_MSA(16);
 VPX_SAD_16xHEIGHTx4D_MSA(16);
 VPX_AVGSAD_16xHEIGHT_MSA(16);
 
 // 16x8
 VPX_SAD_16xHEIGHT_MSA(8);
-VPX_SAD_16xHEIGHTx3_MSA(8);
-VPX_SAD_16xHEIGHTx8_MSA(8);
 VPX_SAD_16xHEIGHTx4D_MSA(8);
 VPX_AVGSAD_16xHEIGHT_MSA(8);
 
 // 8x16
 VPX_SAD_8xHEIGHT_MSA(16);
-VPX_SAD_8xHEIGHTx3_MSA(16);
-VPX_SAD_8xHEIGHTx8_MSA(16);
 VPX_SAD_8xHEIGHTx4D_MSA(16);
 VPX_AVGSAD_8xHEIGHT_MSA(16);
 
 // 8x8
 VPX_SAD_8xHEIGHT_MSA(8);
-VPX_SAD_8xHEIGHTx3_MSA(8);
-VPX_SAD_8xHEIGHTx8_MSA(8);
 VPX_SAD_8xHEIGHTx4D_MSA(8);
 VPX_AVGSAD_8xHEIGHT_MSA(8);
 
@@ -1224,7 +800,5 @@
 
 // 4x4
 VPX_SAD_4xHEIGHT_MSA(4);
-VPX_SAD_4xHEIGHTx3_MSA(4);
-VPX_SAD_4xHEIGHTx8_MSA(4);
 VPX_SAD_4xHEIGHTx4D_MSA(4);
 VPX_AVGSAD_4xHEIGHT_MSA(4);
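
With the x3 and x8 variants removed, the remaining multi-reference entry point
is the x4d form instantiated by the macros above: one source block scored
against four independent reference pointers in a single call. A scalar sketch
of that contract (m and n are the block width and height; illustration only):

#include <stdint.h>

static void sad_mxnx4d_ref(const uint8_t *src, int src_stride,
                           const uint8_t *const refs[4], int ref_stride,
                           int m, int n, uint32_t sads[4]) {
  int i, x, y;
  for (i = 0; i < 4; ++i) {
    uint32_t sad = 0;
    for (y = 0; y < n; ++y) {
      for (x = 0; x < m; ++x) {
        const int d = src[y * src_stride + x] - refs[i][y * ref_stride + x];
        sad += (uint32_t)(d < 0 ? -d : d);
      }
    }
    sads[i] = sad;
  }
}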
diff --git a/libvpx/vpx_dsp/ppc/quantize_vsx.c b/libvpx/vpx_dsp/ppc/quantize_vsx.c
index d85e63b..7cdcbeb 100644
--- a/libvpx/vpx_dsp/ppc/quantize_vsx.c
+++ b/libvpx/vpx_dsp/ppc/quantize_vsx.c
@@ -95,8 +95,8 @@
 }
 
 void vpx_quantize_b_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                        int skip_block, const int16_t *zbin_ptr,
-                        const int16_t *round_ptr, const int16_t *quant_ptr,
+                        const int16_t *zbin_ptr, const int16_t *round_ptr,
+                        const int16_t *quant_ptr,
                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                         uint16_t *eob_ptr, const int16_t *scan_ptr,
@@ -122,8 +122,6 @@
   zero_mask1 = vec_cmpge(coeff1_abs, zbin);
 
   (void)scan_ptr;
-  (void)skip_block;
-  assert(!skip_block);
 
   qcoeff0 =
       quantize_coeff(coeff0, coeff0_abs, round, quant, quant_shift, zero_mask0);
@@ -196,12 +194,14 @@
   *eob_ptr = eob[0];
 }
 
-void vpx_quantize_b_32x32_vsx(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
-    const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-    tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
-    const int16_t *scan_ptr, const int16_t *iscan_ptr) {
+void vpx_quantize_b_32x32_vsx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
+                              const int16_t *zbin_ptr, const int16_t *round_ptr,
+                              const int16_t *quant_ptr,
+                              const int16_t *quant_shift_ptr,
+                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                              const int16_t *scan_ptr,
+                              const int16_t *iscan_ptr) {
   // In stage 1, we quantize 16 coeffs (DC + 15 AC)
   // In stage 2, we loop 42 times and quantize 24 coeffs per iteration
   // (32 * 32 - 16) / 24 = 42
@@ -227,9 +227,7 @@
   int16x8_t coeff1_abs = vec_abs(coeff1);
 
   (void)scan_ptr;
-  (void)skip_block;
   (void)n_coeffs;
-  assert(!skip_block);
 
   // 32x32 quantization requires that zbin and round be divided by 2
   zbin = vec_sra(vec_add(zbin, vec_ones_s16), vec_ones_u16);
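The vector halving above is the SIMD form of the rounding divide used by the scalar reference; a minimal scalar sketch (halve_rounding is an illustrative name, and the inputs are assumed non-negative, as zbin and round are):

/* (x + 1) >> 1 == ROUND_POWER_OF_TWO(x, 1): divide by 2 with rounding,
 * applied to zbin and round for the 32x32 transform size. */
static int16_t halve_rounding(int16_t x) { return (int16_t)((x + 1) >> 1); }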
diff --git a/libvpx/vpx_dsp/quantize.c b/libvpx/vpx_dsp/quantize.c
index 61818f6..5d6ba64 100644
--- a/libvpx/vpx_dsp/quantize.c
+++ b/libvpx/vpx_dsp/quantize.c
@@ -15,7 +15,7 @@
 #include "vpx_dsp/vpx_dsp_common.h"
 #include "vpx_mem/vpx_mem.h"
 
-void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
+void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
                      const int16_t *round_ptr, const int16_t quant,
                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                      const int16_t dequant, uint16_t *eob_ptr) {
@@ -28,28 +28,26 @@
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
-    tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
-    tmp = (tmp * quant) >> 16;
-    qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
-    if (tmp) eob = 0;
-  }
+  tmp = clamp(abs_coeff + round_ptr[rc != 0], INT16_MIN, INT16_MAX);
+  tmp = (tmp * quant) >> 16;
+  qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+  dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant;
+  if (tmp) eob = 0;
+
   *eob_ptr = eob + 1;
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
-                            int skip_block, const int16_t *round_ptr,
-                            const int16_t quant, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, const int16_t dequant,
-                            uint16_t *eob_ptr) {
+                            const int16_t *round_ptr, const int16_t quant,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t dequant, uint16_t *eob_ptr) {
   int eob = -1;
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
+  {
     const int coeff = coeff_ptr[0];
     const int coeff_sign = (coeff >> 31);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
@@ -59,11 +57,12 @@
     dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant;
     if (abs_qcoeff) eob = 0;
   }
+
   *eob_ptr = eob + 1;
 }
 #endif
 
-void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr,
                            const int16_t *round_ptr, const int16_t quant,
                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                            const int16_t dequant, uint16_t *eob_ptr) {
@@ -77,19 +76,18 @@
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
-    tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1),
-                INT16_MIN, INT16_MAX);
-    tmp = (tmp * quant) >> 15;
-    qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
-    dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / 2;
-    if (tmp) eob = 0;
-  }
+  tmp = clamp(abs_coeff + ROUND_POWER_OF_TWO(round_ptr[rc != 0], 1), INT16_MIN,
+              INT16_MAX);
+  tmp = (tmp * quant) >> 15;
+  qcoeff_ptr[rc] = (tmp ^ coeff_sign) - coeff_sign;
+  dqcoeff_ptr[rc] = qcoeff_ptr[rc] * dequant / 2;
+  if (tmp) eob = 0;
+
   *eob_ptr = eob + 1;
 }
 
 #if CONFIG_VP9_HIGHBITDEPTH
-void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
                                   const int16_t *round_ptr, const int16_t quant,
                                   tran_low_t *qcoeff_ptr,
                                   tran_low_t *dqcoeff_ptr,
@@ -100,7 +98,7 @@
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
 
-  if (!skip_block) {
+  {
     const int coeff = coeff_ptr[0];
     const int coeff_sign = (coeff >> 31);
     const int abs_coeff = (coeff ^ coeff_sign) - coeff_sign;
@@ -110,23 +108,21 @@
     dqcoeff_ptr[0] = qcoeff_ptr[0] * dequant / 2;
     if (abs_qcoeff) eob = 0;
   }
+
   *eob_ptr = eob + 1;
 }
 #endif
 
 void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                      int skip_block, const int16_t *zbin_ptr,
-                      const int16_t *round_ptr, const int16_t *quant_ptr,
-                      const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
-                      tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
-                      uint16_t *eob_ptr, const int16_t *scan,
-                      const int16_t *iscan) {
+                      const int16_t *zbin_ptr, const int16_t *round_ptr,
+                      const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
+                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                      const int16_t *dequant_ptr, uint16_t *eob_ptr,
+                      const int16_t *scan, const int16_t *iscan) {
   int i, non_zero_count = (int)n_coeffs, eob = -1;
   const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
   const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
   (void)iscan;
-  (void)skip_block;
-  assert(!skip_block);
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
@@ -166,8 +162,8 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vpx_highbd_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                             int skip_block, const int16_t *zbin_ptr,
-                             const int16_t *round_ptr, const int16_t *quant_ptr,
+                             const int16_t *zbin_ptr, const int16_t *round_ptr,
+                             const int16_t *quant_ptr,
                              const int16_t *quant_shift_ptr,
                              tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                              const int16_t *dequant_ptr, uint16_t *eob_ptr,
@@ -176,8 +172,6 @@
   const int zbins[2] = { zbin_ptr[0], zbin_ptr[1] };
   const int nzbins[2] = { zbins[0] * -1, zbins[1] * -1 };
   (void)iscan;
-  (void)skip_block;
-  assert(!skip_block);
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
@@ -215,8 +209,8 @@
 #endif
 
 void vpx_quantize_b_32x32_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                            int skip_block, const int16_t *zbin_ptr,
-                            const int16_t *round_ptr, const int16_t *quant_ptr,
+                            const int16_t *zbin_ptr, const int16_t *round_ptr,
+                            const int16_t *quant_ptr,
                             const int16_t *quant_shift_ptr,
                             tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                             const int16_t *dequant_ptr, uint16_t *eob_ptr,
@@ -229,8 +223,6 @@
   int idx_arr[1024];
   int i, eob = -1;
   (void)iscan;
-  (void)skip_block;
-  assert(!skip_block);
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
@@ -277,8 +269,8 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vpx_highbd_quantize_b_32x32_c(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan) {
@@ -290,8 +282,6 @@
   int idx_arr[1024];
   int i, eob = -1;
   (void)iscan;
-  (void)skip_block;
-  assert(!skip_block);
 
   memset(qcoeff_ptr, 0, n_coeffs * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, n_coeffs * sizeof(*dqcoeff_ptr));
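A standalone restatement of the DC quantize arithmetic now that the skip_block branch is gone (a simplified sketch, not part of the patch: tran_low_t narrowed to int32_t, rc fixed at 0, quantize_dc_sketch is an illustrative name):

#include <stdint.h>

static void quantize_dc_sketch(int32_t coeff, int16_t round, int16_t quant,
                               int16_t dequant, int32_t *qcoeff,
                               int32_t *dqcoeff, uint16_t *eob_ptr) {
  const int32_t sign = coeff >> 31;             /* 0 or -1 */
  const int32_t abs_coeff = (coeff ^ sign) - sign;
  int32_t tmp = abs_coeff + round;
  /* clamp() in the original; the INT16_MIN bound is unreachable here
   * because abs_coeff and round are non-negative. */
  if (tmp > INT16_MAX) tmp = INT16_MAX;
  tmp = (tmp * quant) >> 16;                    /* Q16 fixed-point scale */
  *qcoeff = (tmp ^ sign) - sign;                /* restore the sign */
  *dqcoeff = *qcoeff * dequant;
  *eob_ptr = tmp ? 1 : 0;                       /* eob starts at -1, +1 at end */
}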
diff --git a/libvpx/vpx_dsp/quantize.h b/libvpx/vpx_dsp/quantize.h
index 7cac140..8e13844 100644
--- a/libvpx/vpx_dsp/quantize.h
+++ b/libvpx/vpx_dsp/quantize.h
@@ -18,22 +18,21 @@
 extern "C" {
 #endif
 
-void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs, int skip_block,
+void vpx_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
                      const int16_t *round_ptr, const int16_t quant,
                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                      const int16_t dequant, uint16_t *eob_ptr);
-void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+void vpx_quantize_dc_32x32(const tran_low_t *coeff_ptr,
                            const int16_t *round_ptr, const int16_t quant,
                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                            const int16_t dequant, uint16_t *eob_ptr);
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vpx_highbd_quantize_dc(const tran_low_t *coeff_ptr, int n_coeffs,
-                            int skip_block, const int16_t *round_ptr,
-                            const int16_t quant, tran_low_t *qcoeff_ptr,
-                            tran_low_t *dqcoeff_ptr, const int16_t dequant,
-                            uint16_t *eob_ptr);
-void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr, int skip_block,
+                            const int16_t *round_ptr, const int16_t quant,
+                            tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
+                            const int16_t dequant, uint16_t *eob_ptr);
+void vpx_highbd_quantize_dc_32x32(const tran_low_t *coeff_ptr,
                                   const int16_t *round_ptr, const int16_t quant,
                                   tran_low_t *qcoeff_ptr,
                                   tran_low_t *dqcoeff_ptr,
diff --git a/libvpx/vpx_dsp/sad.c b/libvpx/vpx_dsp/sad.c
index 7693220..b47c434 100644
--- a/libvpx/vpx_dsp/sad.c
+++ b/libvpx/vpx_dsp/sad.c
@@ -45,23 +45,11 @@
     return sad(src_ptr, src_stride, comp_pred, m, m, n);                      \
   }
 
-// depending on call sites, pass **ref_array to avoid & in subsequent call and
-// de-dup with 4D below.
-#define sadMxNxK(m, n, k)                                                     \
-  void vpx_sad##m##x##n##x##k##_c(const uint8_t *src_ptr, int src_stride,     \
-                                  const uint8_t *ref_ptr, int ref_stride,     \
-                                  uint32_t *sad_array) {                      \
-    int i;                                                                    \
-    for (i = 0; i < k; ++i)                                                   \
-      sad_array[i] =                                                          \
-          vpx_sad##m##x##n##_c(src_ptr, src_stride, &ref_ptr[i], ref_stride); \
-  }
-
-// This appears to be equivalent to the above when k == 4 and refs is const
+// Compare |src_ptr| to 4 distinct references in |ref_array[4]|
 #define sadMxNx4D(m, n)                                                        \
   void vpx_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride,         \
-                               const uint8_t *const ref_array[],               \
-                               int ref_stride, uint32_t *sad_array) {          \
+                               const uint8_t *const ref_array[4],              \
+                               int ref_stride, uint32_t sad_array[4]) {        \
     int i;                                                                     \
     for (i = 0; i < 4; ++i)                                                    \
       sad_array[i] =                                                           \
@@ -83,7 +71,6 @@
 
 // 32x32
 sadMxN(32, 32)
-sadMxNxK(32, 32, 8)
 sadMxNx4D(32, 32)
 
 // 32x16
@@ -96,26 +83,18 @@
 
 // 16x16
 sadMxN(16, 16)
-sadMxNxK(16, 16, 3)
-sadMxNxK(16, 16, 8)
 sadMxNx4D(16, 16)
 
 // 16x8
 sadMxN(16, 8)
-sadMxNxK(16, 8, 3)
-sadMxNxK(16, 8, 8)
 sadMxNx4D(16, 8)
 
 // 8x16
 sadMxN(8, 16)
-sadMxNxK(8, 16, 3)
-sadMxNxK(8, 16, 8)
 sadMxNx4D(8, 16)
 
 // 8x8
 sadMxN(8, 8)
-sadMxNxK(8, 8, 3)
-sadMxNxK(8, 8, 8)
 sadMxNx4D(8, 8)
 
 // 8x4
@@ -128,8 +107,6 @@
 
 // 4x4
 sadMxN(4, 4)
-sadMxNxK(4, 4, 3)
-sadMxNxK(4, 4, 8)
 sadMxNx4D(4, 4)
 /* clang-format on */
 
@@ -181,15 +158,15 @@
     return highbd_sadb(src_ptr, src_stride, comp_pred, m, m, n);               \
   }
 
-#define highbd_sadMxNx4D(m, n)                                                \
-  void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride, \
-                                      const uint8_t *const ref_array[],       \
-                                      int ref_stride, uint32_t *sad_array) {  \
-    int i;                                                                    \
-    for (i = 0; i < 4; ++i) {                                                 \
-      sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride,         \
-                                                 ref_array[i], ref_stride);   \
-    }                                                                         \
+#define highbd_sadMxNx4D(m, n)                                                 \
+  void vpx_highbd_sad##m##x##n##x4d_c(const uint8_t *src_ptr, int src_stride,  \
+                                      const uint8_t *const ref_array[4],       \
+                                      int ref_stride, uint32_t sad_array[4]) { \
+    int i;                                                                     \
+    for (i = 0; i < 4; ++i) {                                                  \
+      sad_array[i] = vpx_highbd_sad##m##x##n##_c(src_ptr, src_stride,          \
+                                                 ref_array[i], ref_stride);    \
+    }                                                                          \
   }
 
 /* clang-format off */
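A hypothetical caller for the 4D kernel (score_four and the wrapper layout are illustration, not from this patch); unlike the removed x3/x8 variants, the four references are fully independent pointers:

#include <stdint.h>

/* Prototype as generated by sadMxNx4D(16, 16) above. */
void vpx_sad16x16x4d_c(const uint8_t *src_ptr, int src_stride,
                       const uint8_t *const ref_array[4], int ref_stride,
                       uint32_t sad_array[4]);

static void score_four(const uint8_t *src, int src_stride,
                       const uint8_t *const cand[4], int ref_stride,
                       uint32_t out[4]) {
  /* Each cand[i] may point anywhere in the reference frame. */
  vpx_sad16x16x4d_c(src, src_stride, cand, ref_stride, out);
}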
diff --git a/libvpx/vpx_dsp/variance.h b/libvpx/vpx_dsp/variance.h
index f8b44f0..755cb90 100644
--- a/libvpx/vpx_dsp/variance.h
+++ b/libvpx/vpx_dsp/variance.h
@@ -59,8 +59,6 @@
   vpx_sad_fn_t sdf;
   vpx_variance_fn_t vf;
   vpx_subpixvariance_fn_t svf;
-  vpx_sad_multi_fn_t sdx3f;
-  vpx_sad_multi_fn_t sdx8f;
   vpx_sad_multi_d_fn_t sdx4df;
 #if VPX_ARCH_X86 || VPX_ARCH_X86_64
   vp8_copy32xn_fn_t copymem;
@@ -76,7 +74,6 @@
   vpx_subpixvariance_fn_t svf;
   vpx_subp_avg_variance_fn_t svaf;
   vpx_sad_multi_d_fn_t sdx4df;
-  vpx_sad_multi_fn_t sdx8f;
 } vp9_variance_fn_ptr_t;
 #endif  // CONFIG_VP9
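A sketch of how the slimmed vp9 table is populated once sdx8f is gone (init_16x16_fns is an illustrative name; vpx_sad16x16 and friends are the usual RTCD dispatch symbols, assumed here for context):

static void init_16x16_fns(vp9_variance_fn_ptr_t *fn) {
  fn->sdf    = vpx_sad16x16;                    /* single-block SAD */
  fn->vf     = vpx_variance16x16;               /* variance */
  fn->svf    = vpx_sub_pixel_variance16x16;     /* sub-pel variance */
  fn->svaf   = vpx_sub_pixel_avg_variance16x16; /* sub-pel avg variance */
  fn->sdx4df = vpx_sad16x16x4d;                 /* 4-candidate SAD */
}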
 
diff --git a/libvpx/vpx_dsp/vpx_dsp.mk b/libvpx/vpx_dsp/vpx_dsp.mk
index 0165310..13999af 100644
--- a/libvpx/vpx_dsp/vpx_dsp.mk
+++ b/libvpx/vpx_dsp/vpx_dsp.mk
@@ -74,6 +74,7 @@
 DSP_SRCS-$(HAVE_NEON_ASM) += arm/intrapred_neon_asm$(ASM)
 DSP_SRCS-$(HAVE_NEON) += arm/intrapred_neon.c
 DSP_SRCS-$(HAVE_MSA) += mips/intrapred_msa.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/intrapred_lsx.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred4_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred8_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/intrapred16_dspr2.c
@@ -162,6 +163,17 @@
 
 DSP_SRCS-$(HAVE_VSX)  += ppc/vpx_convolve_vsx.c
 
+# common (lsx)
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_horiz_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_vert_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_horiz_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_vert_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve8_avg_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_avg_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_copy_lsx.c
+DSP_SRCS-$(HAVE_LSX) += loongarch/vpx_convolve_lsx.h
+
 # loop filters
 DSP_SRCS-yes += loopfilter.c
 
@@ -188,6 +200,11 @@
 DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_horiz_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2)  += mips/loopfilter_mb_vert_dspr2.c
 
+DSP_SRCS-$(HAVE_LSX)    += loongarch/loopfilter_lsx.h
+DSP_SRCS-$(HAVE_LSX)    += loongarch/loopfilter_16_lsx.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/loopfilter_8_lsx.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/loopfilter_4_lsx.c
+
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_NEON)   += arm/highbd_loopfilter_neon.c
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_loopfilter_sse2.c
@@ -197,6 +214,7 @@
 DSP_SRCS-yes            += txfm_common.h
 DSP_SRCS-$(HAVE_SSE2)   += x86/txfm_common_sse2.h
 DSP_SRCS-$(HAVE_MSA)    += mips/txfm_macros_msa.h
+DSP_SRCS-$(HAVE_LSX)    += loongarch/txfm_macros_lsx.h
 # forward transform
 ifeq ($(CONFIG_VP9_ENCODER),yes)
 DSP_SRCS-yes            += fwd_txfm.c
@@ -217,9 +235,12 @@
 DSP_SRCS-$(HAVE_NEON)   += arm/fwd_txfm_neon.c
 DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.h
 DSP_SRCS-$(HAVE_MSA)    += mips/fwd_txfm_msa.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/fwd_txfm_lsx.h
+DSP_SRCS-$(HAVE_LSX)    += loongarch/fwd_txfm_lsx.c
 
 ifneq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_MSA)    += mips/fwd_dct32x32_msa.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/fwd_dct32x32_lsx.c
 endif  # !CONFIG_VP9_HIGHBITDEPTH
 
 DSP_SRCS-$(HAVE_VSX)    += ppc/fdct32x32_vsx.c
@@ -252,6 +273,8 @@
 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans16_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_dspr2.c
 DSP_SRCS-$(HAVE_DSPR2) += mips/itrans32_cols_dspr2.c
+
+DSP_SRCS-$(HAVE_LSX)   += loongarch/idct32x32_lsx.c
 else  # CONFIG_VP9_HIGHBITDEPTH
 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct4x4_add_neon.c
 DSP_SRCS-$(HAVE_NEON)  += arm/highbd_idct8x8_add_neon.c
@@ -305,6 +328,7 @@
 DSP_SRCS-$(HAVE_AVX)    += x86/quantize_avx.c
 DSP_SRCS-$(HAVE_NEON)   += arm/quantize_neon.c
 DSP_SRCS-$(HAVE_VSX)    += ppc/quantize_vsx.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/quantize_lsx.c
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2)   += x86/highbd_quantize_intrin_sse2.c
 endif
@@ -316,6 +340,7 @@
 DSP_SRCS-$(HAVE_NEON)  += arm/avg_neon.c
 DSP_SRCS-$(HAVE_NEON)  += arm/hadamard_neon.c
 DSP_SRCS-$(HAVE_MSA)   += mips/avg_msa.c
+DSP_SRCS-$(HAVE_LSX)   += loongarch/avg_lsx.c
 ifeq ($(VPX_ARCH_X86_64),yes)
 DSP_SRCS-$(HAVE_SSSE3) += x86/avg_ssse3_x86_64.asm
 endif
@@ -342,12 +367,11 @@
 DSP_SRCS-$(HAVE_MSA)    += mips/sad_msa.c
 DSP_SRCS-$(HAVE_MSA)    += mips/subtract_msa.c
 
+DSP_SRCS-$(HAVE_LSX)    += loongarch/sad_lsx.c
+
 DSP_SRCS-$(HAVE_MMI)    += mips/sad_mmi.c
 DSP_SRCS-$(HAVE_MMI)    += mips/subtract_mmi.c
 
-DSP_SRCS-$(HAVE_SSE3)   += x86/sad_sse3.asm
-DSP_SRCS-$(HAVE_SSSE3)  += x86/sad_ssse3.asm
-DSP_SRCS-$(HAVE_SSE4_1) += x86/sad_sse4.asm
 DSP_SRCS-$(HAVE_AVX2)   += x86/sad4d_avx2.c
 DSP_SRCS-$(HAVE_AVX2)   += x86/sad_avx2.c
 DSP_SRCS-$(HAVE_AVX512) += x86/sad4d_avx512.c
@@ -359,6 +383,8 @@
 DSP_SRCS-$(HAVE_VSX) += ppc/sad_vsx.c
 DSP_SRCS-$(HAVE_VSX) += ppc/subtract_vsx.c
 
+DSP_SRCS-$(HAVE_LSX)    += loongarch/subtract_lsx.c
+
 ifeq ($(CONFIG_VP9_HIGHBITDEPTH),yes)
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad4d_sse2.asm
 DSP_SRCS-$(HAVE_SSE2) += x86/highbd_sad_sse2.asm
@@ -377,6 +403,11 @@
 DSP_SRCS-$(HAVE_MSA)    += mips/variance_msa.c
 DSP_SRCS-$(HAVE_MSA)    += mips/sub_pixel_variance_msa.c
 
+DSP_SRCS-$(HAVE_LSX)    += loongarch/variance_lsx.h
+DSP_SRCS-$(HAVE_LSX)    += loongarch/variance_lsx.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/sub_pixel_variance_lsx.c
+DSP_SRCS-$(HAVE_LSX)    += loongarch/avg_pred_lsx.c
+
 DSP_SRCS-$(HAVE_MMI)    += mips/variance_mmi.c
 
 DSP_SRCS-$(HAVE_SSE2)   += x86/avg_pred_sse2.c
@@ -413,6 +444,9 @@
 DSP_SRCS-$(HAVE_SSE2) += x86/mem_sse2.h
 DSP_SRCS-$(HAVE_SSE2) += x86/transpose_sse2.h
 
+# LSX utilities
+DSP_SRCS-$(HAVE_LSX)  += loongarch/bitdepth_conversion_lsx.h
+
 DSP_SRCS-no += $(DSP_SRCS_REMOVE-yes)
 
 DSP_SRCS-yes += vpx_dsp_rtcd.c
diff --git a/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl b/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
index fd7eefd..d3c668f 100644
--- a/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
+++ b/libvpx/vpx_dsp/vpx_dsp_rtcd_defs.pl
@@ -117,7 +117,7 @@
 
 add_proto qw/void vpx_dc_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 # TODO(crbug.com/webm/1522): Re-enable vsx implementation.
-specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2/;
+specialize qw/vpx_dc_predictor_8x8 dspr2 neon msa sse2 lsx/;
 
 add_proto qw/void vpx_dc_top_predictor_8x8/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_top_predictor_8x8 neon msa sse2/;
@@ -155,7 +155,7 @@
 specialize qw/vpx_tm_predictor_16x16 neon msa sse2 vsx/;
 
 add_proto qw/void vpx_dc_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
-specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2 vsx/;
+specialize qw/vpx_dc_predictor_16x16 dspr2 neon msa sse2 vsx lsx/;
 
 add_proto qw/void vpx_dc_top_predictor_16x16/, "uint8_t *dst, ptrdiff_t stride, const uint8_t *above, const uint8_t *left";
 specialize qw/vpx_dc_top_predictor_16x16 neon msa sse2 vsx/;
@@ -368,28 +368,28 @@
 # Sub Pixel Filters
 #
 add_proto qw/void vpx_convolve_copy/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx/;
+specialize qw/vpx_convolve_copy neon dspr2 msa sse2 vsx lsx/;
 
 add_proto qw/void vpx_convolve_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi/;
+specialize qw/vpx_convolve_avg neon dspr2 msa sse2 vsx mmi lsx/;
 
 add_proto qw/void vpx_convolve8/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;
+specialize qw/vpx_convolve8 sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/;
 
 add_proto qw/void vpx_convolve8_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;
+specialize qw/vpx_convolve8_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/;
 
 add_proto qw/void vpx_convolve8_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;
+specialize qw/vpx_convolve8_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/;
 
 add_proto qw/void vpx_convolve8_avg/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;
+specialize qw/vpx_convolve8_avg sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/;
 
 add_proto qw/void vpx_convolve8_avg_horiz/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;
+specialize qw/vpx_convolve8_avg_horiz sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/;
 
 add_proto qw/void vpx_convolve8_avg_vert/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
-specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi/;
+specialize qw/vpx_convolve8_avg_vert sse2 ssse3 avx2 neon dspr2 msa vsx mmi lsx/;
 
 add_proto qw/void vpx_scaled_2d/, "const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const InterpKernel *filter, int x0_q4, int x_step_q4, int y0_q4, int y_step_q4, int w, int h";
 specialize qw/vpx_scaled_2d ssse3 neon msa/;
@@ -442,37 +442,37 @@
 specialize qw/vpx_lpf_vertical_16 sse2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_vertical_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_vertical_16_dual sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_vertical_16_dual sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_vertical_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_vertical_8 sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_vertical_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vpx_lpf_vertical_8_dual sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_vertical_8_dual sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_vertical_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_vertical_4 sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_vertical_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_vertical_4_dual sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_horizontal_16/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
 specialize qw/vpx_lpf_horizontal_16 sse2 avx2 neon dspr2 msa/;
 
 add_proto qw/void vpx_lpf_horizontal_16_dual/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_horizontal_16_dual sse2 avx2 neon dspr2 msa/;
+specialize qw/vpx_lpf_horizontal_16_dual sse2 avx2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_horizontal_8/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_horizontal_8 sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_horizontal_8_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vpx_lpf_horizontal_8_dual sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_horizontal_8_dual sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_horizontal_4/, "uint8_t *s, int pitch, const uint8_t *blimit, const uint8_t *limit, const uint8_t *thresh";
-specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_horizontal_4 sse2 neon dspr2 msa lsx/;
 
 add_proto qw/void vpx_lpf_horizontal_4_dual/, "uint8_t *s, int pitch, const uint8_t *blimit0, const uint8_t *limit0, const uint8_t *thresh0, const uint8_t *blimit1, const uint8_t *limit1, const uint8_t *thresh1";
-specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa/;
+specialize qw/vpx_lpf_horizontal_4_dual sse2 neon dspr2 msa lsx/;
 } #CONFIG_VP9
 
 if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
@@ -573,28 +573,28 @@
   add_proto qw/void vpx_highbd_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
 } else {
   add_proto qw/void vpx_fdct4x4/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct4x4 neon sse2 msa/;
+  specialize qw/vpx_fdct4x4 neon sse2 msa lsx/;
 
   add_proto qw/void vpx_fdct4x4_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_fdct4x4_1 sse2 neon/;
 
   add_proto qw/void vpx_fdct8x8/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct8x8 sse2 neon msa/, "$ssse3_x86_64";
+  specialize qw/vpx_fdct8x8 sse2 neon msa lsx/, "$ssse3_x86_64";
 
   add_proto qw/void vpx_fdct8x8_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_fdct8x8_1 sse2 neon msa/;
 
   add_proto qw/void vpx_fdct16x16/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct16x16 neon sse2 msa/;
+  specialize qw/vpx_fdct16x16 neon sse2 msa lsx/;
 
   add_proto qw/void vpx_fdct16x16_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_fdct16x16_1 sse2 neon msa/;
 
   add_proto qw/void vpx_fdct32x32/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct32x32 neon sse2 avx2 msa/;
+  specialize qw/vpx_fdct32x32 neon sse2 avx2 msa lsx/;
 
   add_proto qw/void vpx_fdct32x32_rd/, "const int16_t *input, tran_low_t *output, int stride";
-  specialize qw/vpx_fdct32x32_rd sse2 avx2 neon msa vsx/;
+  specialize qw/vpx_fdct32x32_rd sse2 avx2 neon msa vsx lsx/;
 
   add_proto qw/void vpx_fdct32x32_1/, "const int16_t *input, tran_low_t *output, int stride";
   specialize qw/vpx_fdct32x32_1 sse2 neon msa/;
@@ -652,12 +652,13 @@
     $vpx_idct16x16_38_add_msa=vpx_idct16x16_256_add_msa;
     specialize qw/vpx_idct16x16_10_add dspr2 msa/;
     specialize qw/vpx_idct16x16_1_add dspr2 msa/;
-    specialize qw/vpx_idct32x32_1024_add dspr2 msa/;
+    specialize qw/vpx_idct32x32_1024_add dspr2 msa lsx/;
     specialize qw/vpx_idct32x32_135_add dspr2 msa/;
     $vpx_idct32x32_135_add_dspr2=vpx_idct32x32_1024_add_dspr2;
     $vpx_idct32x32_135_add_msa=vpx_idct32x32_1024_add_msa;
-    specialize qw/vpx_idct32x32_34_add dspr2 msa/;
-    specialize qw/vpx_idct32x32_1_add dspr2 msa/;
+    $vpx_idct32x32_135_add_lsx=vpx_idct32x32_1024_add_lsx;
+    specialize qw/vpx_idct32x32_34_add dspr2 msa lsx/;
+    specialize qw/vpx_idct32x32_1_add dspr2 msa lsx/;
     specialize qw/vpx_iwht4x4_16_add msa/;
     specialize qw/vpx_iwht4x4_1_add msa/;
   } # !CONFIG_VP9_HIGHBITDEPTH
@@ -709,17 +710,17 @@
 # Quantization
 #
 if (vpx_config("CONFIG_VP9_ENCODER") eq "yes") {
-  add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx/;
+  add_proto qw/void vpx_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vpx_quantize_b neon sse2 ssse3 avx vsx lsx/;
 
-  add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
-  specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx/;
+  add_proto qw/void vpx_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+  specialize qw/vpx_quantize_b_32x32 neon ssse3 avx vsx lsx/;
 
   if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
-    add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    add_proto qw/void vpx_highbd_quantize_b/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
     specialize qw/vpx_highbd_quantize_b sse2/;
 
-    add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
+    add_proto qw/void vpx_highbd_quantize_b_32x32/, "const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan";
     specialize qw/vpx_highbd_quantize_b_32x32 sse2/;
   }  # CONFIG_VP9_HIGHBITDEPTH
 }  # CONFIG_VP9_ENCODER
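For context, the assumed shape of what an add_proto/specialize pair above emits into a per-target vpx_dsp_rtcd.h (a sketch, not the literal generator output; the binding depends on the configured extensions -- a static #define as below, or a runtime-selected function pointer):

void vpx_quantize_b_c(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                      const int16_t *zbin_ptr, const int16_t *round_ptr,
                      const int16_t *quant_ptr, const int16_t *quant_shift_ptr,
                      tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                      const int16_t *dequant_ptr, uint16_t *eob_ptr,
                      const int16_t *scan, const int16_t *iscan);
void vpx_quantize_b_neon(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
                         const int16_t *zbin_ptr, const int16_t *round_ptr,
                         const int16_t *quant_ptr,
                         const int16_t *quant_shift_ptr,
                         tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                         const int16_t *dequant_ptr, uint16_t *eob_ptr,
                         const int16_t *scan, const int16_t *iscan);
#define vpx_quantize_b vpx_quantize_b_neon /* on a NEON-only build (sketch) */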
@@ -729,13 +730,13 @@
 # Block subtraction
 #
 add_proto qw/void vpx_subtract_block/, "int rows, int cols, int16_t *diff_ptr, ptrdiff_t diff_stride, const uint8_t *src_ptr, ptrdiff_t src_stride, const uint8_t *pred_ptr, ptrdiff_t pred_stride";
-specialize qw/vpx_subtract_block neon msa mmi sse2 vsx/;
+specialize qw/vpx_subtract_block neon msa mmi sse2 vsx lsx/;
 
 #
 # Single block SAD
 #
 add_proto qw/unsigned int vpx_sad64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad64x64 neon avx2 msa sse2 vsx mmi/;
+specialize qw/vpx_sad64x64 neon avx2 msa sse2 vsx mmi lsx/;
 
 add_proto qw/unsigned int vpx_sad64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
 specialize qw/vpx_sad64x32 neon avx2 msa sse2 vsx mmi/;
@@ -744,7 +745,7 @@
 specialize qw/vpx_sad32x64 neon avx2 msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad32x32 neon avx2 msa sse2 vsx mmi/;
+specialize qw/vpx_sad32x32 neon avx2 msa sse2 vsx mmi lsx/;
 
 add_proto qw/unsigned int vpx_sad32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
 specialize qw/vpx_sad32x16 neon avx2 msa sse2 vsx mmi/;
@@ -753,7 +754,7 @@
 specialize qw/vpx_sad16x32 neon msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad16x16 neon msa sse2 vsx mmi/;
+specialize qw/vpx_sad16x16 neon msa sse2 vsx mmi lsx/;
 
 add_proto qw/unsigned int vpx_sad16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
 specialize qw/vpx_sad16x8 neon msa sse2 vsx mmi/;
@@ -762,7 +763,7 @@
 specialize qw/vpx_sad8x16 neon msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
-specialize qw/vpx_sad8x8 neon msa sse2 vsx mmi/;
+specialize qw/vpx_sad8x8 neon msa sse2 vsx mmi lsx/;
 
 add_proto qw/unsigned int vpx_sad8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride";
 specialize qw/vpx_sad8x4 neon msa sse2 vsx mmi/;
@@ -788,10 +789,10 @@
 
   if (vpx_config("CONFIG_VP9_HIGHBITDEPTH") eq "yes") {
     add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-    specialize qw/vpx_hadamard_8x8 sse2 neon vsx/, "$ssse3_x86_64";
+    specialize qw/vpx_hadamard_8x8 sse2 neon vsx lsx/, "$ssse3_x86_64";
 
     add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
-    specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx/;
+    specialize qw/vpx_hadamard_16x16 avx2 sse2 neon vsx lsx/;
 
     add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, tran_low_t *coeff";
     specialize qw/vpx_hadamard_32x32 sse2 avx2/;
@@ -812,10 +813,10 @@
     specialize qw/vpx_highbd_satd avx2/;
   } else {
     add_proto qw/void vpx_hadamard_8x8/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
-    specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx/, "$ssse3_x86_64";
+    specialize qw/vpx_hadamard_8x8 sse2 neon msa vsx lsx/, "$ssse3_x86_64";
 
     add_proto qw/void vpx_hadamard_16x16/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
-    specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx/;
+    specialize qw/vpx_hadamard_16x16 avx2 sse2 neon msa vsx lsx/;
 
     add_proto qw/void vpx_hadamard_32x32/, "const int16_t *src_diff, ptrdiff_t src_stride, int16_t *coeff";
     specialize qw/vpx_hadamard_32x32 sse2 avx2/;
@@ -824,7 +825,7 @@
     specialize qw/vpx_satd avx2 sse2 neon msa/;
   }
 
-  add_proto qw/void vpx_int_pro_row/, "int16_t *hbuf, const uint8_t *ref, const int ref_stride, const int height";
+  add_proto qw/void vpx_int_pro_row/, "int16_t hbuf[16], const uint8_t *ref, const int ref_stride, const int height";
   specialize qw/vpx_int_pro_row sse2 neon msa/;
 
   add_proto qw/int16_t vpx_int_pro_col/, "const uint8_t *ref, const int width";
@@ -835,7 +836,7 @@
 }  # CONFIG_VP9_ENCODER
 
 add_proto qw/unsigned int vpx_sad64x64_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad64x64_avg neon avx2 msa sse2 vsx mmi/;
+specialize qw/vpx_sad64x64_avg neon avx2 msa sse2 vsx mmi lsx/;
 
 add_proto qw/unsigned int vpx_sad64x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vpx_sad64x32_avg neon avx2 msa sse2 vsx mmi/;
@@ -844,7 +845,7 @@
 specialize qw/vpx_sad32x64_avg neon avx2 msa sse2 vsx mmi/;
 
 add_proto qw/unsigned int vpx_sad32x32_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
-specialize qw/vpx_sad32x32_avg neon avx2 msa sse2 vsx mmi/;
+specialize qw/vpx_sad32x32_avg neon avx2 msa sse2 vsx mmi lsx/;
 
 add_proto qw/unsigned int vpx_sad32x16_avg/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, const uint8_t *second_pred";
 specialize qw/vpx_sad32x16_avg neon avx2 msa sse2 vsx mmi/;
@@ -874,83 +875,45 @@
 specialize qw/vpx_sad4x4_avg neon msa sse2 mmi/;
 
 #
-# Multi-block SAD, comparing a reference to N blocks 1 pixel apart horizontally
-#
-# Blocks of 3
-add_proto qw/void vpx_sad16x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x16x3 sse3 ssse3 msa mmi/;
-
-add_proto qw/void vpx_sad16x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x8x3 sse3 ssse3 msa mmi/;
-
-add_proto qw/void vpx_sad8x16x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x16x3 sse3 msa mmi/;
-
-add_proto qw/void vpx_sad8x8x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x3 sse3 msa mmi/;
-
-add_proto qw/void vpx_sad4x4x3/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x3 sse3 msa mmi/;
-
-# Blocks of 8
-add_proto qw/void vpx_sad32x32x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x32x8 avx2/;
-
-add_proto qw/void vpx_sad16x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x16x8 sse4_1 msa mmi/;
-
-add_proto qw/void vpx_sad16x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x8x8 sse4_1 msa mmi/;
-
-add_proto qw/void vpx_sad8x16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x16x8 sse4_1 msa mmi/;
-
-add_proto qw/void vpx_sad8x8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x8 sse4_1 msa mmi/;
-
-add_proto qw/void vpx_sad4x4x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad4x4x8 sse4_1 msa mmi/;
-
-#
 # Multi-block SAD, comparing a reference to N independent blocks
 #
-add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi/;
+add_proto qw/void vpx_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad64x64x4d avx512 avx2 neon msa sse2 vsx mmi lsx/;
 
-add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi/;
+add_proto qw/void vpx_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad64x32x4d neon msa sse2 vsx mmi lsx/;
 
-add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi/;
+add_proto qw/void vpx_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad32x64x4d neon msa sse2 vsx mmi lsx/;
 
-add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi/;
+add_proto qw/void vpx_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad32x32x4d avx2 neon msa sse2 vsx mmi lsx/;
 
-add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
 specialize qw/vpx_sad32x16x4d neon msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
 specialize qw/vpx_sad16x32x4d neon msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi/;
+add_proto qw/void vpx_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad16x16x4d neon msa sse2 vsx mmi lsx/;
 
-add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
 specialize qw/vpx_sad16x8x4d neon msa sse2 vsx mmi/;
 
-add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
 specialize qw/vpx_sad8x16x4d neon msa sse2 mmi/;
 
-add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
-specialize qw/vpx_sad8x8x4d neon msa sse2 mmi/;
+add_proto qw/void vpx_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
+specialize qw/vpx_sad8x8x4d neon msa sse2 mmi lsx/;
 
-add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
 specialize qw/vpx_sad8x4x4d neon msa sse2 mmi/;
 
-add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
 specialize qw/vpx_sad4x8x4d neon msa sse2 mmi/;
 
-add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[], int ref_stride, uint32_t *sad_array";
+add_proto qw/void vpx_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t * const ref_array[4], int ref_stride, uint32_t sad_array[4]";
 specialize qw/vpx_sad4x4x4d neon msa sse2 mmi/;
 
 add_proto qw/uint64_t vpx_sum_squares_2d_i16/, "const int16_t *src, int stride, int size";
@@ -1064,43 +1027,43 @@
   #
   # Multi-block SAD, comparing a reference to N independent blocks
   #
-  add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad64x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
   specialize qw/vpx_highbd_sad64x64x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad64x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
   specialize qw/vpx_highbd_sad64x32x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad32x64x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
   specialize qw/vpx_highbd_sad32x64x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad32x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
   specialize qw/vpx_highbd_sad32x32x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad32x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
   specialize qw/vpx_highbd_sad32x16x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad16x32x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
   specialize qw/vpx_highbd_sad16x32x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad16x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
   specialize qw/vpx_highbd_sad16x16x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad16x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
   specialize qw/vpx_highbd_sad16x8x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad8x16x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
   specialize qw/vpx_highbd_sad8x16x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad8x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
   specialize qw/vpx_highbd_sad8x8x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad8x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
   specialize qw/vpx_highbd_sad8x4x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad4x8x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
   specialize qw/vpx_highbd_sad4x8x4d sse2/;
 
-  add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[], int ref_stride, uint32_t *sad_array";
+  add_proto qw/void vpx_highbd_sad4x4x4d/, "const uint8_t *src_ptr, int src_stride, const uint8_t* const ref_array[4], int ref_stride, uint32_t sad_array[4]";
   specialize qw/vpx_highbd_sad4x4x4d sse2/;
 
   #
@@ -1118,7 +1081,7 @@
 # Variance
 #
 add_proto qw/unsigned int vpx_variance64x64/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi vsx/;
+  specialize qw/vpx_variance64x64 sse2 avx2 neon msa mmi vsx lsx/;
 
 add_proto qw/unsigned int vpx_variance64x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_variance64x32 sse2 avx2 neon msa mmi vsx/;
@@ -1127,7 +1090,7 @@
   specialize qw/vpx_variance32x64 sse2 avx2 neon msa mmi vsx/;
 
 add_proto qw/unsigned int vpx_variance32x32/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi vsx/;
+  specialize qw/vpx_variance32x32 sse2 avx2 neon msa mmi vsx lsx/;
 
 add_proto qw/unsigned int vpx_variance32x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_variance32x16 sse2 avx2 neon msa mmi vsx/;
@@ -1136,7 +1099,7 @@
   specialize qw/vpx_variance16x32 sse2 avx2 neon msa mmi vsx/;
 
 add_proto qw/unsigned int vpx_variance16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi vsx/;
+  specialize qw/vpx_variance16x16 sse2 avx2 neon msa mmi vsx lsx/;
 
 add_proto qw/unsigned int vpx_variance16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_variance16x8 sse2 avx2 neon msa mmi vsx/;
@@ -1145,7 +1108,7 @@
   specialize qw/vpx_variance8x16 sse2 neon msa mmi vsx/;
 
 add_proto qw/unsigned int vpx_variance8x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_variance8x8 sse2 neon msa mmi vsx/;
+  specialize qw/vpx_variance8x8 sse2 neon msa mmi vsx lsx/;
 
 add_proto qw/unsigned int vpx_variance8x4/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_variance8x4 sse2 neon msa mmi vsx/;
@@ -1160,13 +1123,13 @@
 # Specialty Variance
 #
 add_proto qw/void vpx_get16x16var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
-  specialize qw/vpx_get16x16var sse2 avx2 neon msa vsx/;
+  specialize qw/vpx_get16x16var sse2 avx2 neon msa vsx lsx/;
 
 add_proto qw/void vpx_get8x8var/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse, int *sum";
   specialize qw/vpx_get8x8var sse2 neon msa vsx/;
 
 add_proto qw/unsigned int vpx_mse16x16/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
-  specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx/;
+  specialize qw/vpx_mse16x16 sse2 avx2 neon msa mmi vsx lsx/;
 
 add_proto qw/unsigned int vpx_mse16x8/, "const uint8_t *src_ptr, int src_stride, const uint8_t *ref_ptr, int ref_stride, unsigned int *sse";
   specialize qw/vpx_mse16x8 sse2 avx2 msa mmi vsx/;
@@ -1184,7 +1147,7 @@
   specialize qw/vpx_get4x4sse_cs neon msa vsx/;
 
 add_proto qw/void vpx_comp_avg_pred/, "uint8_t *comp_pred, const uint8_t *pred, int width, int height, const uint8_t *ref, int ref_stride";
-  specialize qw/vpx_comp_avg_pred neon sse2 vsx/;
+  specialize qw/vpx_comp_avg_pred neon sse2 vsx lsx/;
 
 #
 # Subpixel Variance
@@ -1199,7 +1162,7 @@
   specialize qw/vpx_sub_pixel_variance32x64 neon msa mmi sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance32x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa mmi sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_variance32x32 avx2 neon msa mmi sse2 ssse3 lsx/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance32x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance32x16 neon msa mmi sse2 ssse3/;
@@ -1208,7 +1171,7 @@
   specialize qw/vpx_sub_pixel_variance16x32 neon msa mmi sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance16x16/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance16x16 neon msa mmi sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_variance16x16 neon msa mmi sse2 ssse3 lsx/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance16x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance16x8 neon msa mmi sse2 ssse3/;
@@ -1217,7 +1180,7 @@
   specialize qw/vpx_sub_pixel_variance8x16 neon msa mmi sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance8x8/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
-  specialize qw/vpx_sub_pixel_variance8x8 neon msa mmi sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_variance8x8 neon msa mmi sse2 ssse3 lsx/;
 
 add_proto qw/uint32_t vpx_sub_pixel_variance8x4/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse";
   specialize qw/vpx_sub_pixel_variance8x4 neon msa mmi sse2 ssse3/;
@@ -1229,7 +1192,7 @@
   specialize qw/vpx_sub_pixel_variance4x4 neon msa mmi sse2 ssse3/;
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x64/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
-  specialize qw/vpx_sub_pixel_avg_variance64x64 neon avx2 msa mmi sse2 ssse3/;
+  specialize qw/vpx_sub_pixel_avg_variance64x64 neon avx2 msa mmi sse2 ssse3 lsx/;
 
 add_proto qw/uint32_t vpx_sub_pixel_avg_variance64x32/, "const uint8_t *src_ptr, int src_stride, int x_offset, int y_offset, const uint8_t *ref_ptr, int ref_stride, uint32_t *sse, const uint8_t *second_pred";
   specialize qw/vpx_sub_pixel_avg_variance64x32 neon msa mmi sse2 ssse3/;
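
Note on the `lsx` tokens added throughout the `specialize` lines above: they register LoongArch SIMD (LSX) implementations with libvpx's RTCD (run-time CPU detection) generator, which emits the per-target `vpx_dsp_rtcd.h` headers under config/. A minimal sketch of what the static-dispatch output for one of these lines might look like, assuming the generator's usual naming pattern (the exact header contents depend on the configured target):

    unsigned int vpx_variance32x32_c(const uint8_t *src_ptr, int src_stride,
                                     const uint8_t *ref_ptr, int ref_stride,
                                     unsigned int *sse);
    unsigned int vpx_variance32x32_lsx(const uint8_t *src_ptr, int src_stride,
                                       const uint8_t *ref_ptr, int ref_stride,
                                       unsigned int *sse);
    #define vpx_variance32x32 vpx_variance32x32_lsx
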
diff --git a/libvpx/vpx_dsp/x86/avg_intrin_sse2.c b/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
index 3cba258..9da2f34 100644
--- a/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
+++ b/libvpx/vpx_dsp/x86/avg_intrin_sse2.c
@@ -464,7 +464,7 @@
   return _mm_cvtsi128_si32(accum);
 }
 
-void vpx_int_pro_row_sse2(int16_t *hbuf, const uint8_t *ref,
+void vpx_int_pro_row_sse2(int16_t hbuf[16], const uint8_t *ref,
                           const int ref_stride, const int height) {
   int idx;
   __m128i zero = _mm_setzero_si128();
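
Note on the signature change above: `int16_t hbuf[16]` and `int16_t *hbuf` declare the same function type, since an array parameter decays to a pointer in C; the `[16]` only documents the expected extent to callers and static analyzers. A minimal sketch of the equivalence (illustrative names, not libvpx API):

    #include <stdint.h>

    /* Both declare the same type, int16_t *; the [16] is caller-facing
       documentation, and sizeof(hbuf) in the body is sizeof(int16_t *). */
    void fill_row(int16_t hbuf[16]);
    void fill_row(int16_t *hbuf);   /* compatible redeclaration, compiles cleanly */
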
diff --git a/libvpx/vpx_dsp/x86/avg_pred_sse2.c b/libvpx/vpx_dsp/x86/avg_pred_sse2.c
index e4e1e0e..c6e70f7 100644
--- a/libvpx/vpx_dsp/x86/avg_pred_sse2.c
+++ b/libvpx/vpx_dsp/x86/avg_pred_sse2.c
@@ -46,9 +46,9 @@
         r = _mm_loadu_si128((const __m128i *)ref);
         ref += 16;
       } else if (width == 4) {
-        r = _mm_set_epi32(loadu_uint32(ref + 3 * ref_stride),
-                          loadu_uint32(ref + 2 * ref_stride),
-                          loadu_uint32(ref + ref_stride), loadu_uint32(ref));
+        r = _mm_set_epi32(loadu_int32(ref + 3 * ref_stride),
+                          loadu_int32(ref + 2 * ref_stride),
+                          loadu_int32(ref + ref_stride), loadu_int32(ref));
 
         ref += 4 * ref_stride;
       } else {
diff --git a/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c b/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
index 3209625..01a52ec 100644
--- a/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
+++ b/libvpx/vpx_dsp/x86/highbd_convolve_avx2.c
@@ -1465,10 +1465,10 @@
 #define vpx_highbd_filter_block1d4_h4_avg_avx2 \
   vpx_highbd_filter_block1d4_h8_avg_avx2
 
-HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0)
 HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
-                 src - src_stride * (num_taps / 2 - 1), , avx2, 0);
-HIGH_FUN_CONV_2D(, avx2, 0);
+                 src - src_stride * (num_taps / 2 - 1), , avx2, 0)
+HIGH_FUN_CONV_2D(, avx2, 0)
 
 // From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
 highbd_filter8_1dfunction vpx_highbd_filter_block1d4_h8_avg_sse2;
@@ -1487,9 +1487,9 @@
 #define vpx_highbd_filter_block1d4_v2_avg_avx2 \
   vpx_highbd_filter_block1d4_v2_avg_sse2
 
-HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1);
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1)
 HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
-                 src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1);
-HIGH_FUN_CONV_2D(avg_, avx2, 1);
+                 src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1)
+HIGH_FUN_CONV_2D(avg_, avx2, 1)
 
 #undef HIGHBD_FUNC
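
Note on the dropped semicolons above (and in the variance and SAD files further down): these macros already expand to complete function definitions, so the trailing `;` left an empty file-scope declaration after each body, which pedantic compilers flag (e.g. GCC's "ISO C does not allow extra ';' outside of a function" under -Wpedantic, or Clang's -Wextra-semi). A generic illustration of the pattern, using a hypothetical macro rather than the libvpx ones:

    #define DEFINE_GETTER(n) \
      int get_##n(void) { return n; }

    DEFINE_GETTER(4)   /* expands to a full definition; no ';' needed */
    DEFINE_GETTER(8);  /* stray ';' after the body draws the warning */
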
diff --git a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
index 7149e4f..4535a0f 100644
--- a/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
+++ b/libvpx/vpx_dsp/x86/highbd_quantize_intrin_sse2.c
@@ -18,7 +18,7 @@
 
 #if CONFIG_VP9_HIGHBITDEPTH
 void vpx_highbd_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t count,
-                                int skip_block, const int16_t *zbin_ptr,
+                                const int16_t *zbin_ptr,
                                 const int16_t *round_ptr,
                                 const int16_t *quant_ptr,
                                 const int16_t *quant_shift_ptr,
@@ -39,8 +39,6 @@
   nzbins[1] = _mm_sub_epi32(nzbins[1], zbins[1]);
 
   (void)scan;
-  (void)skip_block;
-  assert(!skip_block);
 
   memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr));
   memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr));
@@ -94,8 +92,8 @@
 }
 
 void vpx_highbd_quantize_b_32x32_sse2(
-    const tran_low_t *coeff_ptr, intptr_t n_coeffs, int skip_block,
-    const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr,
+    const tran_low_t *coeff_ptr, intptr_t n_coeffs, const int16_t *zbin_ptr,
+    const int16_t *round_ptr, const int16_t *quant_ptr,
     const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
     tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr,
     const int16_t *scan, const int16_t *iscan) {
@@ -107,8 +105,6 @@
   const int zbin0_tmp = ROUND_POWER_OF_TWO(zbin_ptr[0], 1);
   const int zbin1_tmp = ROUND_POWER_OF_TWO(zbin_ptr[1], 1);
   (void)scan;
-  (void)skip_block;
-  assert(!skip_block);
 
   zbins[0] = _mm_set_epi32(zbin1_tmp, zbin1_tmp, zbin1_tmp, zbin0_tmp);
   zbins[1] = _mm_set1_epi32(zbin1_tmp);
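
Note on the `skip_block` removal above (and in the quantize_avx/sse2/ssse3 files below): the parameter was dead weight, since every implementation discarded it with `(void)skip_block` and asserted it was zero, so upstream dropped it from the quantize interface. A toy sketch of the simplification, not the libvpx API:

    #include <assert.h>

    /* Before: a parameter every caller fixed at 0. */
    static int quantize_old(int coeff, int skip_block, int quant) {
      (void)skip_block;
      assert(!skip_block);
      return coeff / quant;
    }

    /* After: same behavior, one fewer argument to keep in sync. */
    static int quantize_new(int coeff, int quant) { return coeff / quant; }
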
diff --git a/libvpx/vpx_dsp/x86/highbd_variance_sse2.c b/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
index dd6cfbb..7c8d79b 100644
--- a/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
+++ b/libvpx/vpx_dsp/x86/highbd_variance_sse2.c
@@ -121,8 +121,8 @@
     *sse = ROUND_POWER_OF_TWO(*sse, 8);                                       \
   }
 
-HIGH_GET_VAR(16);
-HIGH_GET_VAR(8);
+HIGH_GET_VAR(16)
+HIGH_GET_VAR(8)
 
 #undef HIGH_GET_VAR
 
@@ -167,16 +167,16 @@
     return (var >= 0) ? (uint32_t)var : 0;                                 \
   }
 
-VAR_FN(64, 64, 16, 12);
-VAR_FN(64, 32, 16, 11);
-VAR_FN(32, 64, 16, 11);
-VAR_FN(32, 32, 16, 10);
-VAR_FN(32, 16, 16, 9);
-VAR_FN(16, 32, 16, 9);
-VAR_FN(16, 16, 16, 8);
-VAR_FN(16, 8, 8, 7);
-VAR_FN(8, 16, 8, 7);
-VAR_FN(8, 8, 8, 6);
+VAR_FN(64, 64, 16, 12)
+VAR_FN(64, 32, 16, 11)
+VAR_FN(32, 64, 16, 11)
+VAR_FN(32, 32, 16, 10)
+VAR_FN(32, 16, 16, 9)
+VAR_FN(16, 32, 16, 9)
+VAR_FN(16, 16, 16, 8)
+VAR_FN(16, 8, 8, 7)
+VAR_FN(8, 16, 8, 7)
+VAR_FN(8, 8, 8, 6)
 
 #undef VAR_FN
 
@@ -255,10 +255,10 @@
       const uint16_t *ref, ptrdiff_t ref_stride, int height,                 \
       unsigned int *sse, void *unused0, void *unused);
 #define DECLS(opt) \
-  DECL(8, opt);    \
+  DECL(8, opt)     \
   DECL(16, opt)
 
-DECLS(sse2);
+DECLS(sse2)
 
 #undef DECLS
 #undef DECL
@@ -383,20 +383,20 @@
     return (var >= 0) ? (uint32_t)var : 0;                                     \
   }
 
-#define FNS(opt)                        \
-  FN(64, 64, 16, 6, 6, opt, (int64_t)); \
-  FN(64, 32, 16, 6, 5, opt, (int64_t)); \
-  FN(32, 64, 16, 5, 6, opt, (int64_t)); \
-  FN(32, 32, 16, 5, 5, opt, (int64_t)); \
-  FN(32, 16, 16, 5, 4, opt, (int64_t)); \
-  FN(16, 32, 16, 4, 5, opt, (int64_t)); \
-  FN(16, 16, 16, 4, 4, opt, (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt, (int64_t));  \
-  FN(8, 16, 8, 3, 4, opt, (int64_t));   \
-  FN(8, 8, 8, 3, 3, opt, (int64_t));    \
-  FN(8, 4, 8, 3, 2, opt, (int64_t));
+#define FNS(opt)                       \
+  FN(64, 64, 16, 6, 6, opt, (int64_t)) \
+  FN(64, 32, 16, 6, 5, opt, (int64_t)) \
+  FN(32, 64, 16, 5, 6, opt, (int64_t)) \
+  FN(32, 32, 16, 5, 5, opt, (int64_t)) \
+  FN(32, 16, 16, 5, 4, opt, (int64_t)) \
+  FN(16, 32, 16, 4, 5, opt, (int64_t)) \
+  FN(16, 16, 16, 4, 4, opt, (int64_t)) \
+  FN(16, 8, 16, 4, 3, opt, (int64_t))  \
+  FN(8, 16, 8, 3, 4, opt, (int64_t))   \
+  FN(8, 8, 8, 3, 3, opt, (int64_t))    \
+  FN(8, 4, 8, 3, 2, opt, (int64_t))
 
-FNS(sse2);
+FNS(sse2)
 
 #undef FNS
 #undef FN
@@ -412,7 +412,7 @@
   DECL(16, opt1)    \
   DECL(8, opt1)
 
-DECLS(sse2);
+DECLS(sse2)
 #undef DECL
 #undef DECLS
 
@@ -542,20 +542,20 @@
     return (var >= 0) ? (uint32_t)var : 0;                                     \
   }
 
-#define FNS(opt1)                        \
-  FN(64, 64, 16, 6, 6, opt1, (int64_t)); \
-  FN(64, 32, 16, 6, 5, opt1, (int64_t)); \
-  FN(32, 64, 16, 5, 6, opt1, (int64_t)); \
-  FN(32, 32, 16, 5, 5, opt1, (int64_t)); \
-  FN(32, 16, 16, 5, 4, opt1, (int64_t)); \
-  FN(16, 32, 16, 4, 5, opt1, (int64_t)); \
-  FN(16, 16, 16, 4, 4, opt1, (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt1, (int64_t));  \
-  FN(8, 16, 8, 4, 3, opt1, (int64_t));   \
-  FN(8, 8, 8, 3, 3, opt1, (int64_t));    \
-  FN(8, 4, 8, 3, 2, opt1, (int64_t));
+#define FNS(opt1)                       \
+  FN(64, 64, 16, 6, 6, opt1, (int64_t)) \
+  FN(64, 32, 16, 6, 5, opt1, (int64_t)) \
+  FN(32, 64, 16, 5, 6, opt1, (int64_t)) \
+  FN(32, 32, 16, 5, 5, opt1, (int64_t)) \
+  FN(32, 16, 16, 5, 4, opt1, (int64_t)) \
+  FN(16, 32, 16, 4, 5, opt1, (int64_t)) \
+  FN(16, 16, 16, 4, 4, opt1, (int64_t)) \
+  FN(16, 8, 16, 4, 3, opt1, (int64_t))  \
+  FN(8, 16, 8, 4, 3, opt1, (int64_t))   \
+  FN(8, 8, 8, 3, 3, opt1, (int64_t))    \
+  FN(8, 4, 8, 3, 2, opt1, (int64_t))
 
-FNS(sse2);
+FNS(sse2)
 
 #undef FNS
 #undef FN
diff --git a/libvpx/vpx_dsp/x86/loopfilter_sse2.c b/libvpx/vpx_dsp/x86/loopfilter_sse2.c
index b6ff248..347c9fd 100644
--- a/libvpx/vpx_dsp/x86/loopfilter_sse2.c
+++ b/libvpx/vpx_dsp/x86/loopfilter_sse2.c
@@ -211,21 +211,21 @@
   // 00 10 20 30 01 11 21 31  02 12 22 32 03 13 23 33
   ps1ps0 = _mm_unpacklo_epi8(ps1ps0, x0);
 
-  storeu_uint32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+  storeu_int32(s + 0 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
   ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  storeu_uint32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+  storeu_int32(s + 1 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
   ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  storeu_uint32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+  storeu_int32(s + 2 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
   ps1ps0 = _mm_srli_si128(ps1ps0, 4);
-  storeu_uint32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
+  storeu_int32(s + 3 * pitch - 2, _mm_cvtsi128_si32(ps1ps0));
 
-  storeu_uint32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+  storeu_int32(s + 4 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  storeu_uint32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+  storeu_int32(s + 5 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  storeu_uint32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+  storeu_int32(s + 6 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
   qs1qs0 = _mm_srli_si128(qs1qs0, 4);
-  storeu_uint32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
+  storeu_int32(s + 7 * pitch - 2, _mm_cvtsi128_si32(qs1qs0));
 }
 
 void vpx_lpf_horizontal_16_sse2(unsigned char *s, int pitch,
diff --git a/libvpx/vpx_dsp/x86/mem_sse2.h b/libvpx/vpx_dsp/x86/mem_sse2.h
index 258ab38..8b6d4d1 100644
--- a/libvpx/vpx_dsp/x86/mem_sse2.h
+++ b/libvpx/vpx_dsp/x86/mem_sse2.h
@@ -16,12 +16,12 @@
 
 #include "./vpx_config.h"
 
-static INLINE void storeu_uint32(void *dst, uint32_t v) {
+static INLINE void storeu_int32(void *dst, int32_t v) {
   memcpy(dst, &v, sizeof(v));
 }
 
-static INLINE uint32_t loadu_uint32(const void *src) {
-  uint32_t v;
+static INLINE int32_t loadu_int32(const void *src) {
+  int32_t v;
   memcpy(&v, src, sizeof(v));
   return v;
 }
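
Note on these helpers: the `memcpy` body keeps unaligned 4-byte loads and stores free of alignment and strict-aliasing undefined behavior, and the rename from `uint32_t` to `int32_t` matches the signedness of the SSE2 intrinsics they feed (`_mm_cvtsi32_si128` takes, and `_mm_cvtsi128_si32` returns, `int`). A small self-contained usage sketch under those assumptions:

    #include <emmintrin.h>
    #include <stdint.h>
    #include <string.h>

    static int32_t loadu_int32_sketch(const void *src) {
      int32_t v;
      memcpy(&v, src, sizeof(v));  /* compilers lower this to one unaligned mov */
      return v;
    }

    int main(void) {
      const uint8_t buf[8] = {1, 2, 3, 4, 5, 6, 7, 8};
      /* deliberately misaligned source; still well-defined */
      __m128i x = _mm_cvtsi32_si128(loadu_int32_sketch(buf + 1));
      return _mm_cvtsi128_si32(x) == 0x05040302 ? 0 : 1;
    }
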
diff --git a/libvpx/vpx_dsp/x86/quantize_avx.c b/libvpx/vpx_dsp/x86/quantize_avx.c
index 0a91d36..706e4e6 100644
--- a/libvpx/vpx_dsp/x86/quantize_avx.c
+++ b/libvpx/vpx_dsp/x86/quantize_avx.c
@@ -21,8 +21,8 @@
 #include "vpx_dsp/x86/quantize_ssse3.h"
 
 void vpx_quantize_b_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                        int skip_block, const int16_t *zbin_ptr,
-                        const int16_t *round_ptr, const int16_t *quant_ptr,
+                        const int16_t *zbin_ptr, const int16_t *round_ptr,
+                        const int16_t *quant_ptr,
                         const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                         tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                         uint16_t *eob_ptr, const int16_t *scan,
@@ -39,8 +39,6 @@
   __m128i eob = zero, eob0;
 
   (void)scan;
-  (void)skip_block;
-  assert(!skip_block);
 
   *eob_ptr = 0;
 
@@ -145,8 +143,7 @@
 }
 
 void vpx_quantize_b_32x32_avx(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                              int skip_block, const int16_t *zbin_ptr,
-                              const int16_t *round_ptr,
+                              const int16_t *zbin_ptr, const int16_t *round_ptr,
                               const int16_t *quant_ptr,
                               const int16_t *quant_shift_ptr,
                               tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
@@ -166,8 +163,6 @@
 
   (void)scan;
   (void)n_coeffs;
-  (void)skip_block;
-  assert(!skip_block);
 
   // Setup global values.
   // The 32x32 halves zbin and round.
diff --git a/libvpx/vpx_dsp/x86/quantize_sse2.c b/libvpx/vpx_dsp/x86/quantize_sse2.c
index e38a405..459d95f 100644
--- a/libvpx/vpx_dsp/x86/quantize_sse2.c
+++ b/libvpx/vpx_dsp/x86/quantize_sse2.c
@@ -18,8 +18,8 @@
 #include "vpx_dsp/x86/quantize_sse2.h"
 
 void vpx_quantize_b_sse2(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                         int skip_block, const int16_t *zbin_ptr,
-                         const int16_t *round_ptr, const int16_t *quant_ptr,
+                         const int16_t *zbin_ptr, const int16_t *round_ptr,
+                         const int16_t *quant_ptr,
                          const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr,
                          tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr,
                          uint16_t *eob_ptr, const int16_t *scan,
@@ -34,8 +34,6 @@
   __m128i eob, eob0;
 
   (void)scan;
-  (void)skip_block;
-  assert(!skip_block);
 
   // Setup global values.
   load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
diff --git a/libvpx/vpx_dsp/x86/quantize_ssse3.c b/libvpx/vpx_dsp/x86/quantize_ssse3.c
index fc1d919..9d2a88b 100644
--- a/libvpx/vpx_dsp/x86/quantize_ssse3.c
+++ b/libvpx/vpx_dsp/x86/quantize_ssse3.c
@@ -18,8 +18,8 @@
 #include "vpx_dsp/x86/quantize_ssse3.h"
 
 void vpx_quantize_b_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                          int skip_block, const int16_t *zbin_ptr,
-                          const int16_t *round_ptr, const int16_t *quant_ptr,
+                          const int16_t *zbin_ptr, const int16_t *round_ptr,
+                          const int16_t *quant_ptr,
                           const int16_t *quant_shift_ptr,
                           tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr,
                           const int16_t *dequant_ptr, uint16_t *eob_ptr,
@@ -34,8 +34,6 @@
   __m128i eob, eob0;
 
   (void)scan;
-  (void)skip_block;
-  assert(!skip_block);
 
   load_b_values(zbin_ptr, &zbin, round_ptr, &round, quant_ptr, &quant,
                 dequant_ptr, &dequant, quant_shift_ptr, &shift);
@@ -111,7 +109,7 @@
 }
 
 void vpx_quantize_b_32x32_ssse3(const tran_low_t *coeff_ptr, intptr_t n_coeffs,
-                                int skip_block, const int16_t *zbin_ptr,
+                                const int16_t *zbin_ptr,
                                 const int16_t *round_ptr,
                                 const int16_t *quant_ptr,
                                 const int16_t *quant_shift_ptr,
@@ -131,8 +129,6 @@
 
   (void)scan;
   (void)n_coeffs;
-  (void)skip_block;
-  assert(!skip_block);
 
   // Setup global values.
   // The 32x32 halves zbin and round.
diff --git a/libvpx/vpx_dsp/x86/sad4d_avx2.c b/libvpx/vpx_dsp/x86/sad4d_avx2.c
index a5c4f8c..399b67b 100644
--- a/libvpx/vpx_dsp/x86/sad4d_avx2.c
+++ b/libvpx/vpx_dsp/x86/sad4d_avx2.c
@@ -11,8 +11,12 @@
 #include "./vpx_dsp_rtcd.h"
 #include "vpx/vpx_integer.h"
 
+// Note with sums[4] some versions of Visual Studio may fail due to parameter
+// alignment, though the functions should be equivalent:
+// error C2719: 'sums': formal parameter with requested alignment of 32 won't be
+// aligned
 static INLINE void calc_final_4(const __m256i *const sums /*[4]*/,
-                                uint32_t *sad_array) {
+                                uint32_t sad_array[4]) {
   const __m256i t0 = _mm256_hadd_epi32(sums[0], sums[1]);
   const __m256i t1 = _mm256_hadd_epi32(sums[2], sums[3]);
   const __m256i t2 = _mm256_hadd_epi32(t0, t1);
@@ -69,63 +73,6 @@
   calc_final_4(sums, sad_array);
 }
 
-void vpx_sad32x32x8_avx2(const uint8_t *src_ptr, int src_stride,
-                         const uint8_t *ref_ptr, int ref_stride,
-                         uint32_t *sad_array) {
-  int i;
-  __m256i sums[8];
-
-  sums[0] = _mm256_setzero_si256();
-  sums[1] = _mm256_setzero_si256();
-  sums[2] = _mm256_setzero_si256();
-  sums[3] = _mm256_setzero_si256();
-  sums[4] = _mm256_setzero_si256();
-  sums[5] = _mm256_setzero_si256();
-  sums[6] = _mm256_setzero_si256();
-  sums[7] = _mm256_setzero_si256();
-
-  for (i = 0; i < 32; i++) {
-    __m256i r[8];
-
-    // load src and all ref[]
-    const __m256i s = _mm256_load_si256((const __m256i *)src_ptr);
-    r[0] = _mm256_loadu_si256((const __m256i *)&ref_ptr[0]);
-    r[1] = _mm256_loadu_si256((const __m256i *)&ref_ptr[1]);
-    r[2] = _mm256_loadu_si256((const __m256i *)&ref_ptr[2]);
-    r[3] = _mm256_loadu_si256((const __m256i *)&ref_ptr[3]);
-    r[4] = _mm256_loadu_si256((const __m256i *)&ref_ptr[4]);
-    r[5] = _mm256_loadu_si256((const __m256i *)&ref_ptr[5]);
-    r[6] = _mm256_loadu_si256((const __m256i *)&ref_ptr[6]);
-    r[7] = _mm256_loadu_si256((const __m256i *)&ref_ptr[7]);
-
-    // sum of the absolute differences between every ref[] to src
-    r[0] = _mm256_sad_epu8(r[0], s);
-    r[1] = _mm256_sad_epu8(r[1], s);
-    r[2] = _mm256_sad_epu8(r[2], s);
-    r[3] = _mm256_sad_epu8(r[3], s);
-    r[4] = _mm256_sad_epu8(r[4], s);
-    r[5] = _mm256_sad_epu8(r[5], s);
-    r[6] = _mm256_sad_epu8(r[6], s);
-    r[7] = _mm256_sad_epu8(r[7], s);
-
-    // sum every ref[]
-    sums[0] = _mm256_add_epi32(sums[0], r[0]);
-    sums[1] = _mm256_add_epi32(sums[1], r[1]);
-    sums[2] = _mm256_add_epi32(sums[2], r[2]);
-    sums[3] = _mm256_add_epi32(sums[3], r[3]);
-    sums[4] = _mm256_add_epi32(sums[4], r[4]);
-    sums[5] = _mm256_add_epi32(sums[5], r[5]);
-    sums[6] = _mm256_add_epi32(sums[6], r[6]);
-    sums[7] = _mm256_add_epi32(sums[7], r[7]);
-
-    src_ptr += src_stride;
-    ref_ptr += ref_stride;
-  }
-
-  calc_final_4(sums, sad_array);
-  calc_final_4(sums + 4, sad_array + 4);
-}
-
 void vpx_sad64x64x4d_avx2(const uint8_t *src_ptr, int src_stride,
                           const uint8_t *const ref_array[4], int ref_stride,
                           uint32_t sad_array[4]) {
diff --git a/libvpx/vpx_dsp/x86/sad4d_avx512.c b/libvpx/vpx_dsp/x86/sad4d_avx512.c
index 4c5d704..cfd23fe 100644
--- a/libvpx/vpx_dsp/x86/sad4d_avx512.c
+++ b/libvpx/vpx_dsp/x86/sad4d_avx512.c
@@ -13,7 +13,7 @@
 
 void vpx_sad64x64x4d_avx512(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *const ref_array[4], int ref_stride,
-                            uint32_t res[4]) {
+                            uint32_t sad_array[4]) {
   __m512i src_reg, ref0_reg, ref1_reg, ref2_reg, ref3_reg;
   __m512i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
   __m512i sum_mlow, sum_mhigh;
@@ -78,6 +78,6 @@
     sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum256),
                            _mm256_extractf128_si256(sum256, 1));
 
-    _mm_storeu_si128((__m128i *)(res), sum128);
+    _mm_storeu_si128((__m128i *)(sad_array), sum128);
   }
 }
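
Note: with the x8 lookup removed above (vpx_sad32x32x8_avx2) and the x3/x8 .asm files deleted below, the x4d functions such as the two in this file remain the multi-reference SAD path; every variant ultimately reduces to the same `psadbw` primitive. A minimal SSE2-only sketch of one 16-pixel row (illustrative helper, not a libvpx function):

    #include <emmintrin.h>
    #include <stdint.h>

    static uint32_t sad16x1(const uint8_t *src, const uint8_t *ref) {
      const __m128i s = _mm_loadu_si128((const __m128i *)src);
      const __m128i r = _mm_loadu_si128((const __m128i *)ref);
      /* psadbw: |s[i] - r[i]| accumulated into two 64-bit lanes */
      const __m128i d = _mm_sad_epu8(s, r);
      return (uint32_t)_mm_cvtsi128_si32(d) +
             (uint32_t)_mm_cvtsi128_si32(_mm_srli_si128(d, 8));
    }
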
diff --git a/libvpx/vpx_dsp/x86/sad_avx2.c b/libvpx/vpx_dsp/x86/sad_avx2.c
index d944134..3b48acd 100644
--- a/libvpx/vpx_dsp/x86/sad_avx2.c
+++ b/libvpx/vpx_dsp/x86/sad_avx2.c
@@ -71,17 +71,17 @@
     return res;                                                               \
   }
 
-#define FSAD64  \
-  FSAD64_H(64); \
-  FSAD64_H(32);
+#define FSAD64 \
+  FSAD64_H(64) \
+  FSAD64_H(32)
 
-#define FSAD32  \
-  FSAD32_H(64); \
-  FSAD32_H(32); \
-  FSAD32_H(16);
+#define FSAD32 \
+  FSAD32_H(64) \
+  FSAD32_H(32) \
+  FSAD32_H(16)
 
-FSAD64;
-FSAD32;
+FSAD64
+FSAD32
 
 #undef FSAD64
 #undef FSAD32
@@ -160,17 +160,17 @@
     return res;                                                               \
   }
 
-#define FSADAVG64  \
-  FSADAVG64_H(64); \
-  FSADAVG64_H(32);
+#define FSADAVG64 \
+  FSADAVG64_H(64) \
+  FSADAVG64_H(32)
 
-#define FSADAVG32  \
-  FSADAVG32_H(64); \
-  FSADAVG32_H(32); \
-  FSADAVG32_H(16);
+#define FSADAVG32 \
+  FSADAVG32_H(64) \
+  FSADAVG32_H(32) \
+  FSADAVG32_H(16)
 
-FSADAVG64;
-FSADAVG32;
+FSADAVG64
+FSADAVG32
 
 #undef FSADAVG64
 #undef FSADAVG32
diff --git a/libvpx/vpx_dsp/x86/sad_sse3.asm b/libvpx/vpx_dsp/x86/sad_sse3.asm
deleted file mode 100644
index acbd2e4..0000000
--- a/libvpx/vpx_dsp/x86/sad_sse3.asm
+++ /dev/null
@@ -1,376 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro STACK_FRAME_CREATE_X3 0
-%if ABI_IS_32BIT
-  %define     src_ptr       rsi
-  %define     src_stride    rax
-  %define     ref_ptr       rdi
-  %define     ref_stride    rdx
-  %define     end_ptr       rcx
-  %define     ret_var       rbx
-  %define     result_ptr    arg(4)
-  %define     height        dword ptr arg(4)
-    push        rbp
-    mov         rbp,        rsp
-    push        rsi
-    push        rdi
-    push        rbx
-
-    mov         rsi,        arg(0)              ; src_ptr
-    mov         rdi,        arg(2)              ; ref_ptr
-
-    movsxd      rax,        dword ptr arg(1)    ; src_stride
-    movsxd      rdx,        dword ptr arg(3)    ; ref_stride
-%else
-  %if LIBVPX_YASM_WIN64
-    SAVE_XMM 7, u
-    %define     src_ptr     rcx
-    %define     src_stride  rdx
-    %define     ref_ptr     r8
-    %define     ref_stride  r9
-    %define     end_ptr     r10
-    %define     ret_var     r11
-    %define     result_ptr  [rsp+xmm_stack_space+8+4*8]
-    %define     height      dword ptr [rsp+xmm_stack_space+8+4*8]
-  %else
-    %define     src_ptr     rdi
-    %define     src_stride  rsi
-    %define     ref_ptr     rdx
-    %define     ref_stride  rcx
-    %define     end_ptr     r9
-    %define     ret_var     r10
-    %define     result_ptr  r8
-    %define     height      r8
-  %endif
-%endif
-
-%endmacro
-
-%macro STACK_FRAME_DESTROY_X3 0
-  %define     src_ptr
-  %define     src_stride
-  %define     ref_ptr
-  %define     ref_stride
-  %define     end_ptr
-  %define     ret_var
-  %define     result_ptr
-  %define     height
-
-%if ABI_IS_32BIT
-    pop         rbx
-    pop         rdi
-    pop         rsi
-    pop         rbp
-%else
-  %if LIBVPX_YASM_WIN64
-    RESTORE_XMM
-  %endif
-%endif
-    ret
-%endmacro
-
-%macro PROCESS_16X2X3 5
-%if %1==0
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm5,       XMMWORD PTR [%3]
-        lddqu           xmm6,       XMMWORD PTR [%3+1]
-        lddqu           xmm7,       XMMWORD PTR [%3+2]
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [%2]
-        lddqu           xmm1,       XMMWORD PTR [%3]
-        lddqu           xmm2,       XMMWORD PTR [%3+1]
-        lddqu           xmm3,       XMMWORD PTR [%3+2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [%2+%4]
-        lddqu           xmm1,       XMMWORD PTR [%3+%5]
-        lddqu           xmm2,       XMMWORD PTR [%3+%5+1]
-        lddqu           xmm3,       XMMWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
-        lea             %2,         [%2+%4*2]
-        lea             %3,         [%3+%5*2]
-%endif
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_8X2X3 5
-%if %1==0
-        movq            mm0,       QWORD PTR [%2]
-        movq            mm5,       QWORD PTR [%3]
-        movq            mm6,       QWORD PTR [%3+1]
-        movq            mm7,       QWORD PTR [%3+2]
-
-        psadbw          mm5,       mm0
-        psadbw          mm6,       mm0
-        psadbw          mm7,       mm0
-%else
-        movq            mm0,       QWORD PTR [%2]
-        movq            mm1,       QWORD PTR [%3]
-        movq            mm2,       QWORD PTR [%3+1]
-        movq            mm3,       QWORD PTR [%3+2]
-
-        psadbw          mm1,       mm0
-        psadbw          mm2,       mm0
-        psadbw          mm3,       mm0
-
-        paddw           mm5,       mm1
-        paddw           mm6,       mm2
-        paddw           mm7,       mm3
-%endif
-        movq            mm0,       QWORD PTR [%2+%4]
-        movq            mm1,       QWORD PTR [%3+%5]
-        movq            mm2,       QWORD PTR [%3+%5+1]
-        movq            mm3,       QWORD PTR [%3+%5+2]
-
-%if %1==0 || %1==1
-        lea             %2,        [%2+%4*2]
-        lea             %3,        [%3+%5*2]
-%endif
-
-        psadbw          mm1,       mm0
-        psadbw          mm2,       mm0
-        psadbw          mm3,       mm0
-
-        paddw           mm5,       mm1
-        paddw           mm6,       mm2
-        paddw           mm7,       mm3
-%endmacro
-
-SECTION .text
-
-;void int vpx_sad16x16x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-globalsym(vpx_sad16x16x3_sse3)
-sym(vpx_sad16x16x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+8],    xmm0
-
-    STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad16x8x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-globalsym(vpx_sad16x8x3_sse3)
-sym(vpx_sad16x8x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_16X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_16X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rcx],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rcx+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rcx+8],    xmm0
-
-    STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad8x16x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-globalsym(vpx_sad8x16x3_sse3)
-sym(vpx_sad8x16x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm5,        mm6
-
-        movq            [rcx],      mm5
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad8x8x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-globalsym(vpx_sad8x8x3_sse3)
-sym(vpx_sad8x8x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        PROCESS_8X2X3 0, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 1, src_ptr, ref_ptr, src_stride, ref_stride
-        PROCESS_8X2X3 2, src_ptr, ref_ptr, src_stride, ref_stride
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm5,        mm6
-
-        movq            [rcx],      mm5
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
-
-;void int vpx_sad4x4x3_sse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-globalsym(vpx_sad4x4x3_sse3)
-sym(vpx_sad4x4x3_sse3):
-
-    STACK_FRAME_CREATE_X3
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm1,        DWORD PTR [ref_ptr]
-
-        movd            mm2,        DWORD PTR [src_ptr+src_stride]
-        movd            mm3,        DWORD PTR [ref_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm2
-        punpcklbw       mm1,        mm3
-
-        movd            mm4,        DWORD PTR [ref_ptr+1]
-        movd            mm5,        DWORD PTR [ref_ptr+2]
-
-        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
-        movd            mm3,        DWORD PTR [ref_ptr+ref_stride+2]
-
-        psadbw          mm1,        mm0
-
-        punpcklbw       mm4,        mm2
-        punpcklbw       mm5,        mm3
-
-        psadbw          mm4,        mm0
-        psadbw          mm5,        mm0
-
-        lea             src_ptr,    [src_ptr+src_stride*2]
-        lea             ref_ptr,    [ref_ptr+ref_stride*2]
-
-        movd            mm0,        DWORD PTR [src_ptr]
-        movd            mm2,        DWORD PTR [ref_ptr]
-
-        movd            mm3,        DWORD PTR [src_ptr+src_stride]
-        movd            mm6,        DWORD PTR [ref_ptr+ref_stride]
-
-        punpcklbw       mm0,        mm3
-        punpcklbw       mm2,        mm6
-
-        movd            mm3,        DWORD PTR [ref_ptr+1]
-        movd            mm7,        DWORD PTR [ref_ptr+2]
-
-        psadbw          mm2,        mm0
-
-        paddw           mm1,        mm2
-
-        movd            mm2,        DWORD PTR [ref_ptr+ref_stride+1]
-        movd            mm6,        DWORD PTR [ref_ptr+ref_stride+2]
-
-        punpcklbw       mm3,        mm2
-        punpcklbw       mm7,        mm6
-
-        psadbw          mm3,        mm0
-        psadbw          mm7,        mm0
-
-        paddw           mm3,        mm4
-        paddw           mm7,        mm5
-
-        mov             rcx,        result_ptr
-
-        punpckldq       mm1,        mm3
-
-        movq            [rcx],      mm1
-        movd            [rcx+8],    mm7
-
-    STACK_FRAME_DESTROY_X3
diff --git a/libvpx/vpx_dsp/x86/sad_sse4.asm b/libvpx/vpx_dsp/x86/sad_sse4.asm
deleted file mode 100644
index 0818ed5..0000000
--- a/libvpx/vpx_dsp/x86/sad_sse4.asm
+++ /dev/null
@@ -1,361 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X8 1
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        movq            xmm2,       MMWORD PTR [rdi+16]
-        punpcklqdq      xmm1,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        movdqa          xmm2,       xmm1
-        mpsadbw         xmm1,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm1,       xmm2
-        paddw           xmm1,       xmm3
-        paddw           xmm1,       xmm4
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        movq            xmm2,       MMWORD PTR [rdi+16]
-        punpcklqdq      xmm5,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm5,       xmm2
-        paddw           xmm5,       xmm3
-        paddw           xmm5,       xmm4
-
-        paddw           xmm1,       xmm5
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        movq            xmm2,       MMWORD PTR [rdi+ rdx+16]
-        punpcklqdq      xmm5,       xmm3
-        punpcklqdq      xmm3,       xmm2
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-
-        psrldq          xmm0,       8
-        movdqa          xmm4,       xmm3
-        mpsadbw         xmm3,       xmm0,  0x0
-        mpsadbw         xmm4,       xmm0,  0x5
-
-        paddw           xmm5,       xmm2
-        paddw           xmm5,       xmm3
-        paddw           xmm5,       xmm4
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro PROCESS_8X2X8 1
-%if %1
-        movq            xmm0,       MMWORD PTR [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm1,       xmm3
-
-        movdqa          xmm2,       xmm1
-        mpsadbw         xmm1,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm1,       xmm2
-%else
-        movq            xmm0,       MMWORD PTR [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm5,       xmm3
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm5,       xmm2
-
-        paddw           xmm1,       xmm5
-%endif
-        movq            xmm0,       MMWORD PTR [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        punpcklqdq      xmm5,       xmm3
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        movdqa          xmm2,       xmm5
-        mpsadbw         xmm5,       xmm0,  0x0
-        mpsadbw         xmm2,       xmm0,  0x5
-        paddw           xmm5,       xmm2
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro PROCESS_4X2X8 1
-%if %1
-        movd            xmm0,       [rsi]
-        movq            xmm1,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm1,       xmm3
-
-        mpsadbw         xmm1,       xmm0,  0x0
-%else
-        movd            xmm0,       [rsi]
-        movq            xmm5,       MMWORD PTR [rdi]
-        movq            xmm3,       MMWORD PTR [rdi+8]
-        punpcklqdq      xmm5,       xmm3
-
-        mpsadbw         xmm5,       xmm0,  0x0
-
-        paddw           xmm1,       xmm5
-%endif
-        movd            xmm0,       [rsi + rax]
-        movq            xmm5,       MMWORD PTR [rdi+ rdx]
-        movq            xmm3,       MMWORD PTR [rdi+ rdx+8]
-        punpcklqdq      xmm5,       xmm3
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        mpsadbw         xmm5,       xmm0,  0x0
-
-        paddw           xmm1,       xmm5
-%endmacro
-
-%macro WRITE_AS_INTS 0
-    mov             rdi,        arg(4)           ;Results
-    pxor            xmm0, xmm0
-    movdqa          xmm2, xmm1
-    punpcklwd       xmm1, xmm0
-    punpckhwd       xmm2, xmm0
-
-    movdqa          [rdi],    xmm1
-    movdqa          [rdi + 16],    xmm2
-%endmacro
-
-SECTION .text
-
-;void vpx_sad16x16x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array);
-globalsym(vpx_sad16x16x8_sse4_1)
-sym(vpx_sad16x16x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_16X2X8 1
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vpx_sad16x8x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-globalsym(vpx_sad16x8x8_sse4_1)
-sym(vpx_sad16x8x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_16X2X8 1
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-    PROCESS_16X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vpx_sad8x8x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-globalsym(vpx_sad8x8x8_sse4_1)
-sym(vpx_sad8x8x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_8X2X8 1
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vpx_sad8x16x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-globalsym(vpx_sad8x16x8_sse4_1)
-sym(vpx_sad8x16x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_8X2X8 1
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-    PROCESS_8X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-;void vpx_sad4x4x8_sse4_1(
-;    const unsigned char *src_ptr,
-;    int  src_stride,
-;    const unsigned char *ref_ptr,
-;    int  ref_stride,
-;    unsigned short *sad_array
-;);
-globalsym(vpx_sad4x4x8_sse4_1)
-sym(vpx_sad4x4x8_sse4_1):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    push        rsi
-    push        rdi
-    ; end prolog
-
-    mov             rsi,        arg(0)           ;src_ptr
-    mov             rdi,        arg(2)           ;ref_ptr
-
-    movsxd          rax,        dword ptr arg(1) ;src_stride
-    movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-    PROCESS_4X2X8 1
-    PROCESS_4X2X8 0
-
-    WRITE_AS_INTS
-
-    ; begin epilog
-    pop         rdi
-    pop         rsi
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-
-
-
diff --git a/libvpx/vpx_dsp/x86/sad_ssse3.asm b/libvpx/vpx_dsp/x86/sad_ssse3.asm
deleted file mode 100644
index a5bc6d7..0000000
--- a/libvpx/vpx_dsp/x86/sad_ssse3.asm
+++ /dev/null
@@ -1,372 +0,0 @@
-;
-;  Copyright (c) 2010 The WebM project authors. All Rights Reserved.
-;
-;  Use of this source code is governed by a BSD-style license
-;  that can be found in the LICENSE file in the root of the source
-;  tree. An additional intellectual property rights grant can be found
-;  in the file PATENTS.  All contributing project authors may
-;  be found in the AUTHORS file in the root of the source tree.
-;
-
-
-%include "vpx_ports/x86_abi_support.asm"
-
-%macro PROCESS_16X2X3 1
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        lddqu           xmm5,       XMMWORD PTR [rdi]
-        lddqu           xmm6,       XMMWORD PTR [rdi+1]
-        lddqu           xmm7,       XMMWORD PTR [rdi+2]
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        lddqu           xmm1,       XMMWORD PTR [rdi]
-        lddqu           xmm2,       XMMWORD PTR [rdi+1]
-        lddqu           xmm3,       XMMWORD PTR [rdi+2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
-        lddqu           xmm1,       XMMWORD PTR [rdi+rdx]
-        lddqu           xmm2,       XMMWORD PTR [rdi+rdx+1]
-        lddqu           xmm3,       XMMWORD PTR [rdi+rdx+2]
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_16X2X3_OFFSET 2
-%if %1
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movdqa          xmm4,       XMMWORD PTR [rdi]
-        movdqa          xmm7,       XMMWORD PTR [rdi+16]
-
-        movdqa          xmm5,       xmm7
-        palignr         xmm5,       xmm4,       %2
-
-        movdqa          xmm6,       xmm7
-        palignr         xmm6,       xmm4,       (%2+1)
-
-        palignr         xmm7,       xmm4,       (%2+2)
-
-        psadbw          xmm5,       xmm0
-        psadbw          xmm6,       xmm0
-        psadbw          xmm7,       xmm0
-%else
-        movdqa          xmm0,       XMMWORD PTR [rsi]
-        movdqa          xmm4,       XMMWORD PTR [rdi]
-        movdqa          xmm3,       XMMWORD PTR [rdi+16]
-
-        movdqa          xmm1,       xmm3
-        palignr         xmm1,       xmm4,       %2
-
-        movdqa          xmm2,       xmm3
-        palignr         xmm2,       xmm4,       (%2+1)
-
-        palignr         xmm3,       xmm4,       (%2+2)
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endif
-        movdqa          xmm0,       XMMWORD PTR [rsi+rax]
-        movdqa          xmm4,       XMMWORD PTR [rdi+rdx]
-        movdqa          xmm3,       XMMWORD PTR [rdi+rdx+16]
-
-        movdqa          xmm1,       xmm3
-        palignr         xmm1,       xmm4,       %2
-
-        movdqa          xmm2,       xmm3
-        palignr         xmm2,       xmm4,       (%2+1)
-
-        palignr         xmm3,       xmm4,       (%2+2)
-
-        lea             rsi,        [rsi+rax*2]
-        lea             rdi,        [rdi+rdx*2]
-
-        psadbw          xmm1,       xmm0
-        psadbw          xmm2,       xmm0
-        psadbw          xmm3,       xmm0
-
-        paddw           xmm5,       xmm1
-        paddw           xmm6,       xmm2
-        paddw           xmm7,       xmm3
-%endmacro
-
-%macro PROCESS_16X16X3_OFFSET 2
-%2_aligned_by_%1:
-
-        sub             rdi,        %1
-
-        PROCESS_16X2X3_OFFSET 1, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-
-        jmp             %2_store_off
-
-%endmacro
-
-%macro PROCESS_16X8X3_OFFSET 2
-%2_aligned_by_%1:
-
-        sub             rdi,        %1
-
-        PROCESS_16X2X3_OFFSET 1, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-        PROCESS_16X2X3_OFFSET 0, %1
-
-        jmp             %2_store_off
-
-%endmacro
-
-SECTION .text
-
-;void int vpx_sad16x16x3_ssse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-globalsym(vpx_sad16x16x3_ssse3)
-sym(vpx_sad16x16x3_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rcx
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        mov             rdx,        0xf
-        and             rdx,        rdi
-
-        jmp .vpx_sad16x16x3_ssse3_skiptable
-.vpx_sad16x16x3_ssse3_jumptable:
-        dd .vpx_sad16x16x3_ssse3_aligned_by_0  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_1  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_2  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_3  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_4  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_5  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_6  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_7  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_8  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_9  - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_10 - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_11 - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_12 - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_13 - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_14 - .vpx_sad16x16x3_ssse3_do_jump
-        dd .vpx_sad16x16x3_ssse3_aligned_by_15 - .vpx_sad16x16x3_ssse3_do_jump
-.vpx_sad16x16x3_ssse3_skiptable:
-
-        call .vpx_sad16x16x3_ssse3_do_jump
-.vpx_sad16x16x3_ssse3_do_jump:
-        pop             rcx                         ; get the address of do_jump
-        mov             rax,  .vpx_sad16x16x3_ssse3_jumptable - .vpx_sad16x16x3_ssse3_do_jump
-        add             rax,  rcx  ; get the absolute address of vpx_sad16x16x3_ssse3_jumptable
-
-        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
-        add             rcx,        rax
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        jmp             rcx
-
-        PROCESS_16X16X3_OFFSET 0,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 1,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 2,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 3,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 4,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 5,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 6,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 7,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 8,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 9,  .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 10, .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 11, .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 12, .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 13, .vpx_sad16x16x3_ssse3
-        PROCESS_16X16X3_OFFSET 14, .vpx_sad16x16x3_ssse3
-
-.vpx_sad16x16x3_ssse3_aligned_by_15:
-        PROCESS_16X2X3 1
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-
-.vpx_sad16x16x3_ssse3_store_off:
-        mov             rdi,        arg(4) ;Results
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rdi],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rdi+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rdi+8],    xmm0
-
-    ; begin epilog
-    pop         rcx
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
-
-;void int vpx_sad16x8x3_ssse3(
-;    unsigned char *src_ptr,
-;    int  src_stride,
-;    unsigned char *ref_ptr,
-;    int  ref_stride,
-;    int  *results)
-globalsym(vpx_sad16x8x3_ssse3)
-sym(vpx_sad16x8x3_ssse3):
-    push        rbp
-    mov         rbp, rsp
-    SHADOW_ARGS_TO_STACK 5
-    SAVE_XMM 7
-    push        rsi
-    push        rdi
-    push        rcx
-    ; end prolog
-
-        mov             rsi,        arg(0) ;src_ptr
-        mov             rdi,        arg(2) ;ref_ptr
-
-        mov             rdx,        0xf
-        and             rdx,        rdi
-
-        jmp .vpx_sad16x8x3_ssse3_skiptable
-.vpx_sad16x8x3_ssse3_jumptable:
-        dd .vpx_sad16x8x3_ssse3_aligned_by_0  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_1  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_2  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_3  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_4  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_5  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_6  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_7  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_8  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_9  - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_10 - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_11 - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_12 - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_13 - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_14 - .vpx_sad16x8x3_ssse3_do_jump
-        dd .vpx_sad16x8x3_ssse3_aligned_by_15 - .vpx_sad16x8x3_ssse3_do_jump
-.vpx_sad16x8x3_ssse3_skiptable:
-
-        call .vpx_sad16x8x3_ssse3_do_jump
-.vpx_sad16x8x3_ssse3_do_jump:
-        pop             rcx                         ; get the address of do_jump
-        mov             rax,  .vpx_sad16x8x3_ssse3_jumptable - .vpx_sad16x8x3_ssse3_do_jump
-        add             rax,  rcx  ; get the absolute address of vpx_sad16x8x3_ssse3_jumptable
-
-        movsxd          rax,  dword [rax + 4*rdx]   ; get the 32 bit offset from the jumptable
-        add             rcx,        rax
-
-        movsxd          rax,        dword ptr arg(1) ;src_stride
-        movsxd          rdx,        dword ptr arg(3) ;ref_stride
-
-        jmp             rcx
-
-        PROCESS_16X8X3_OFFSET 0,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 1,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 2,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 3,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 4,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 5,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 6,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 7,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 8,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 9,  .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 10, .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 11, .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 12, .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 13, .vpx_sad16x8x3_ssse3
-        PROCESS_16X8X3_OFFSET 14, .vpx_sad16x8x3_ssse3
-
-.vpx_sad16x8x3_ssse3_aligned_by_15:
-
-        PROCESS_16X2X3 1
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-        PROCESS_16X2X3 0
-
-.vpx_sad16x8x3_ssse3_store_off:
-        mov             rdi,        arg(4) ;Results
-
-        movq            xmm0,       xmm5
-        psrldq          xmm5,       8
-
-        paddw           xmm0,       xmm5
-        movd            [rdi],      xmm0
-;-
-        movq            xmm0,       xmm6
-        psrldq          xmm6,       8
-
-        paddw           xmm0,       xmm6
-        movd            [rdi+4],    xmm0
-;-
-        movq            xmm0,       xmm7
-        psrldq          xmm7,       8
-
-        paddw           xmm0,       xmm7
-        movd            [rdi+8],    xmm0
-
-    ; begin epilog
-    pop         rcx
-    pop         rdi
-    pop         rsi
-    RESTORE_XMM
-    UNSHADOW_ARGS
-    pop         rbp
-    ret
diff --git a/libvpx/vpx_dsp/x86/variance_sse2.c b/libvpx/vpx_dsp/x86/variance_sse2.c
index 37ef64e..a67c92a 100644
--- a/libvpx/vpx_dsp/x86/variance_sse2.c
+++ b/libvpx/vpx_dsp/x86/variance_sse2.c
@@ -36,8 +36,8 @@
 }
 
 static INLINE __m128i load4x2_sse2(const uint8_t *const p, const int stride) {
-  const __m128i p0 = _mm_cvtsi32_si128(loadu_uint32(p + 0 * stride));
-  const __m128i p1 = _mm_cvtsi32_si128(loadu_uint32(p + 1 * stride));
+  const __m128i p0 = _mm_cvtsi32_si128(loadu_int32(p + 0 * stride));
+  const __m128i p1 = _mm_cvtsi32_si128(loadu_int32(p + 1 * stride));
   const __m128i p01 = _mm_unpacklo_epi32(p0, p1);
   return _mm_unpacklo_epi8(p01, _mm_setzero_si128());
 }
@@ -471,23 +471,23 @@
            (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));    \
   }
 
-#define FNS(opt1, opt2)                              \
-  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t));  \
-  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t));  \
-  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t));  \
-  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t));  \
-  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t));  \
-  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t));  \
-  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t));   \
-  FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t));    \
-  FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t));     \
-  FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t));     \
-  FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t));     \
+#define FNS(opt1, opt2)                             \
+  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t))  \
+  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t))  \
+  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t))  \
+  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t))  \
+  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t))  \
+  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t))  \
+  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \
+  FN(16, 8, 16, 4, 3, opt1, (int32_t), (int32_t))   \
+  FN(8, 16, 8, 3, 4, opt1, (int32_t), (int32_t))    \
+  FN(8, 8, 8, 3, 3, opt1, (int32_t), (int32_t))     \
+  FN(8, 4, 8, 3, 2, opt1, (int32_t), (int32_t))     \
+  FN(4, 8, 4, 2, 3, opt1, (int32_t), (int32_t))     \
   FN(4, 4, 4, 2, 2, opt1, (int32_t), (int32_t))
 
-FNS(sse2, sse2);
-FNS(ssse3, ssse3);
+FNS(sse2, sse2)
+FNS(ssse3, ssse3)
 
 #undef FNS
 #undef FN
@@ -543,23 +543,23 @@
            (unsigned int)(cast_prod(cast se * se) >> (wlog2 + hlog2));    \
   }
 
-#define FNS(opt1, opt2)                              \
-  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t));  \
-  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t));  \
-  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t));  \
-  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t));  \
-  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t));  \
-  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t));  \
-  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)); \
-  FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t));  \
-  FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t));   \
-  FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t));    \
-  FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t));    \
-  FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t));    \
+#define FNS(opt1, opt2)                             \
+  FN(64, 64, 16, 6, 6, opt1, (int64_t), (int64_t))  \
+  FN(64, 32, 16, 6, 5, opt1, (int64_t), (int64_t))  \
+  FN(32, 64, 16, 5, 6, opt1, (int64_t), (int64_t))  \
+  FN(32, 32, 16, 5, 5, opt1, (int64_t), (int64_t))  \
+  FN(32, 16, 16, 5, 4, opt1, (int64_t), (int64_t))  \
+  FN(16, 32, 16, 4, 5, opt1, (int64_t), (int64_t))  \
+  FN(16, 16, 16, 4, 4, opt1, (uint32_t), (int64_t)) \
+  FN(16, 8, 16, 4, 3, opt1, (uint32_t), (int32_t))  \
+  FN(8, 16, 8, 3, 4, opt1, (uint32_t), (int32_t))   \
+  FN(8, 8, 8, 3, 3, opt1, (uint32_t), (int32_t))    \
+  FN(8, 4, 8, 3, 2, opt1, (uint32_t), (int32_t))    \
+  FN(4, 8, 4, 2, 3, opt1, (uint32_t), (int32_t))    \
   FN(4, 4, 4, 2, 2, opt1, (uint32_t), (int32_t))
 
-FNS(sse2, sse);
-FNS(ssse3, ssse3);
+FNS(sse2, sse)
+FNS(ssse3, ssse3)
 
 #undef FNS
 #undef FN
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c b/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
index 2391790..0cbd151 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_4t_intrin_sse2.c
@@ -1040,12 +1040,12 @@
 //                                  const InterpKernel *filter, int x0_q4,
 //                                  int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                                  int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0)
 FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - (num_taps / 2 - 1) * src_stride, ,
-            sse2, 0);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1);
+            sse2, 0)
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1)
 FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
-            src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1);
+            src - (num_taps / 2 - 1) * src_stride, avg_, sse2, 1)
 
 // void vpx_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
 //                         uint8_t *dst, ptrdiff_t dst_stride,
@@ -1057,8 +1057,8 @@
 //                             const InterpKernel *filter, int x0_q4,
 //                             int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                             int w, int h);
-FUN_CONV_2D(, sse2, 0);
-FUN_CONV_2D(avg_, sse2, 1);
+FUN_CONV_2D(, sse2, 0)
+FUN_CONV_2D(avg_, sse2, 1)
 
 #if CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
 // From vpx_dsp/x86/vpx_high_subpixel_8t_sse2.asm.
@@ -1139,12 +1139,12 @@
 //                                         const int16_t *filter_y,
 //                                         int y_step_q4,
 //                                         int w, int h, int bd);
-HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0);
+HIGH_FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , sse2, 0)
 HIGH_FUN_CONV_1D(vert, y0_q4, y_step_q4, v,
-                 src - src_stride * (num_taps / 2 - 1), , sse2, 0);
-HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1);
+                 src - src_stride * (num_taps / 2 - 1), , sse2, 0)
+HIGH_FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, sse2, 1)
 HIGH_FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
-                 src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1);
+                 src - src_stride * (num_taps / 2 - 1), avg_, sse2, 1)
 
 // void vpx_highbd_convolve8_sse2(const uint8_t *src, ptrdiff_t src_stride,
 //                                uint8_t *dst, ptrdiff_t dst_stride,
@@ -1156,6 +1156,6 @@
 //                                    const InterpKernel *filter, int x0_q4,
 //                                    int32_t x_step_q4, int y0_q4,
 //                                    int y_step_q4, int w, int h, int bd);
-HIGH_FUN_CONV_2D(, sse2, 0);
-HIGH_FUN_CONV_2D(avg_, sse2, 1);
+HIGH_FUN_CONV_2D(, sse2, 0)
+HIGH_FUN_CONV_2D(avg_, sse2, 1)
 #endif  // CONFIG_VP9_HIGHBITDEPTH && VPX_ARCH_X86_64
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
index 1eaa19b..6f2983a 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_avx2.c
@@ -969,12 +969,12 @@
 //                                   const InterpKernel *filter, int x0_q4,
 //                                   int32_t x_step_q4, int y0_q4,
 //                                   int y_step_q4, int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , avx2, 0)
 FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
-            avx2, 0);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1);
+            avx2, 0)
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, avx2, 1)
 FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
-            src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1);
+            src - src_stride * (num_taps / 2 - 1), avg_, avx2, 1)
 
 // void vpx_convolve8_avx2(const uint8_t *src, ptrdiff_t src_stride,
 //                          uint8_t *dst, ptrdiff_t dst_stride,
@@ -986,6 +986,6 @@
 //                              const InterpKernel *filter, int x0_q4,
 //                              int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                              int w, int h);
-FUN_CONV_2D(, avx2, 0);
-FUN_CONV_2D(avg_, avx2, 1);
+FUN_CONV_2D(, avx2, 0)
+FUN_CONV_2D(avg_, avx2, 1)
 #endif  // HAVE_AVX2 && HAVE_SSSE3
diff --git a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
index 77355a2..ed46d62 100644
--- a/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
+++ b/libvpx/vpx_dsp/x86/vpx_subpixel_8t_intrin_ssse3.c
@@ -731,12 +731,12 @@
 //                                   const InterpKernel *filter, int x0_q4,
 //                                   int32_t x_step_q4, int y0_q4,
 //                                   int y_step_q4, int w, int h);
-FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0);
+FUN_CONV_1D(horiz, x0_q4, x_step_q4, h, src, , ssse3, 0)
 FUN_CONV_1D(vert, y0_q4, y_step_q4, v, src - src_stride * (num_taps / 2 - 1), ,
-            ssse3, 0);
-FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1);
+            ssse3, 0)
+FUN_CONV_1D(avg_horiz, x0_q4, x_step_q4, h, src, avg_, ssse3, 1)
 FUN_CONV_1D(avg_vert, y0_q4, y_step_q4, v,
-            src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1);
+            src - src_stride * (num_taps / 2 - 1), avg_, ssse3, 1)
 
 static void filter_horiz_w8_ssse3(const uint8_t *const src,
                                   const ptrdiff_t src_stride,
@@ -1083,5 +1083,5 @@
 //                              const InterpKernel *filter, int x0_q4,
 //                              int32_t x_step_q4, int y0_q4, int y_step_q4,
 //                              int w, int h);
-FUN_CONV_2D(, ssse3, 0);
-FUN_CONV_2D(avg_, ssse3, 1);
+FUN_CONV_2D(, ssse3, 0)
+FUN_CONV_2D(avg_, ssse3, 1)
diff --git a/libvpx/vpx_ports/bitops.h b/libvpx/vpx_ports/bitops.h
index 5b2f31c..1b5cdaa 100644
--- a/libvpx/vpx_ports/bitops.h
+++ b/libvpx/vpx_ports/bitops.h
@@ -26,20 +26,32 @@
 extern "C" {
 #endif
 
-// These versions of get_msb() are only valid when n != 0 because all
-// of the optimized versions are undefined when n == 0:
+// These versions of get_lsb() and get_msb() are only valid when n != 0
+// because all of the optimized versions are undefined when n == 0:
 // https://gcc.gnu.org/onlinedocs/gcc/Other-Builtins.html
 
 // use GNU builtins where available.
 #if defined(__GNUC__) && \
     ((__GNUC__ == 3 && __GNUC_MINOR__ >= 4) || __GNUC__ >= 4)
+static INLINE int get_lsb(unsigned int n) {
+  assert(n != 0);
+  return __builtin_ctz(n);
+}
+
 static INLINE int get_msb(unsigned int n) {
   assert(n != 0);
   return 31 ^ __builtin_clz(n);
 }
 #elif defined(USE_MSC_INTRINSICS)
+#pragma intrinsic(_BitScanForward)
 #pragma intrinsic(_BitScanReverse)
 
+static INLINE int get_lsb(unsigned int n) {
+  unsigned long first_set_bit;  // NOLINT(runtime/int)
+  assert(n != 0);
+  _BitScanForward(&first_set_bit, n);
+  return first_set_bit;
+}
+
 static INLINE int get_msb(unsigned int n) {
   unsigned long first_set_bit;
   assert(n != 0);
@@ -48,6 +60,13 @@
 }
 #undef USE_MSC_INTRINSICS
 #else
+static INLINE int get_lsb(unsigned int n) {
+  int i;
+  assert(n != 0);
+  for (i = 0; i < 32 && !(n & 1); ++i) n >>= 1;
+  return i;
+}
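+
+// Informal example of the contract shared by all three implementations:
+// for n == 0x28 (binary 101000), get_lsb(n) == 3 and get_msb(n) == 5.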
+
 // Returns (int)floor(log2(n)). n must be > 0.
 static INLINE int get_msb(unsigned int n) {
   int log = 0;
diff --git a/libvpx/vpx_ports/loongarch.h b/libvpx/vpx_ports/loongarch.h
new file mode 100644
index 0000000..d93ff9f
--- /dev/null
+++ b/libvpx/vpx_ports/loongarch.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Jin Bo  <jinbo@loongson.cn>
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#ifndef VPX_VPX_PORTS_LOONGARCH_H_
+#define VPX_VPX_PORTS_LOONGARCH_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define HAS_LSX 0x01
+#define HAS_LASX 0x02
+
+int loongarch_cpu_caps(void);
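+
+/* Typical usage (a sketch, not mandated by this header): test the returned
+ * flags to gate SIMD code paths at runtime, e.g.
+ *   if (loongarch_cpu_caps() & HAS_LSX) { ... take the LSX path ... }
+ */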
+
+#ifdef __cplusplus
+}  // extern "C"
+#endif
+
+#endif  // VPX_VPX_PORTS_LOONGARCH_H_
diff --git a/libvpx/vpx_ports/loongarch_cpudetect.c b/libvpx/vpx_ports/loongarch_cpudetect.c
new file mode 100644
index 0000000..7b4322d
--- /dev/null
+++ b/libvpx/vpx_ports/loongarch_cpudetect.c
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * Contributed by Jin Bo  <jinbo@loongson.cn>
+ * Contributed by Lu Wang <wanglu@loongson.cn>
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ */
+
+#include "./vpx_config.h"
+#include "vpx_ports/loongarch.h"
+
+#define LOONGARCH_CFG2 0x02
+#define LOONGARCH_CFG2_LSX (1 << 6)
+#define LOONGARCH_CFG2_LASX (1 << 7)
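+
+/* CPUCFG word 2 describes the ISA: bit 6 reports LSX support and bit 7
+ * reports LASX support; the probe below reads and tests those bits. */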
+
+#if CONFIG_RUNTIME_CPU_DETECT
+#if defined(__loongarch__) && defined(__linux__)
+int loongarch_cpu_caps(void) {
+  int reg = 0;
+  int flag = 0;
+
+  __asm__ volatile("cpucfg %0, %1 \n\t" : "+&r"(reg) : "r"(LOONGARCH_CFG2));
+  if (reg & LOONGARCH_CFG2_LSX) flag |= HAS_LSX;
+
+  if (reg & LOONGARCH_CFG2_LASX) flag |= HAS_LASX;
+
+  return flag;
+}
+#else /* end __loongarch__ && __linux__ */
+#error \
+    "--enable-runtime-cpu-detect selected, but no CPU detection method " \
+"available for your platform. Reconfigure with --disable-runtime-cpu-detect."
+#endif
+#else /* end CONFIG_RUNTIME_CPU_DETECT */
+int loongarch_cpu_caps(void) { return 0; }
+#endif
diff --git a/libvpx/vpx_ports/mips.h b/libvpx/vpx_ports/mips.h
index bdc7525..439de75 100644
--- a/libvpx/vpx_ports/mips.h
+++ b/libvpx/vpx_ports/mips.h
@@ -8,8 +8,8 @@
  *  be found in the AUTHORS file in the root of the source tree.
  */
 
-#ifndef VPX_PORTS_MIPS_H_
-#define VPX_PORTS_MIPS_H_
+#ifndef VPX_VPX_PORTS_MIPS_H_
+#define VPX_VPX_PORTS_MIPS_H_
 
 #ifdef __cplusplus
 extern "C" {
@@ -24,4 +24,4 @@
 }  // extern "C"
 #endif
 
-#endif  // VPX_PORTS_MIPS_H_
+#endif  // VPX_VPX_PORTS_MIPS_H_
diff --git a/libvpx/vpx_ports/vpx_once.h b/libvpx/vpx_ports/vpx_once.h
index 4eb592b..d8a8ed8 100644
--- a/libvpx/vpx_ports/vpx_once.h
+++ b/libvpx/vpx_ports/vpx_once.h
@@ -95,7 +95,7 @@
 #define INCL_DOS
 #include <os2.h>
 static void once(void (*func)(void)) {
-  static int done;
+  static volatile int done;
 
   /* If the initialization is complete, return early. */
   if (done) return;
@@ -128,7 +128,7 @@
  */
 
 static void once(void (*func)(void)) {
-  static int done;
+  static volatile int done;
 
   if (!done) {
     func();
diff --git a/libvpx/vpx_ports/vpx_ports.mk b/libvpx/vpx_ports/vpx_ports.mk
index e5001be..e30e87c 100644
--- a/libvpx/vpx_ports/vpx_ports.mk
+++ b/libvpx/vpx_ports/vpx_ports.mk
@@ -45,6 +45,9 @@
 PORTS_SRCS-$(VPX_ARCH_MIPS) += mips_cpudetect.c
 PORTS_SRCS-$(VPX_ARCH_MIPS) += mips.h
 
+PORTS_SRCS-$(VPX_ARCH_LOONGARCH) += loongarch_cpudetect.c
+PORTS_SRCS-$(VPX_ARCH_LOONGARCH) += loongarch.h
+
 ifeq ($(VPX_ARCH_MIPS), yes)
 PORTS_SRCS-yes += asmdefs_mmi.h
 endif
diff --git a/libvpx/vpx_ports/x86.h b/libvpx/vpx_ports/x86.h
index 4d5391b..795fb29 100644
--- a/libvpx/vpx_ports/x86.h
+++ b/libvpx/vpx_ports/x86.h
@@ -47,7 +47,7 @@
 #define cpuid(func, func2, ax, bx, cx, dx)                      \
   __asm__ __volatile__("cpuid           \n\t"                   \
                        : "=a"(ax), "=b"(bx), "=c"(cx), "=d"(dx) \
-                       : "a"(func), "c"(func2));
+                       : "a"(func), "c"(func2))
 #else
 #define cpuid(func, func2, ax, bx, cx, dx)     \
   __asm__ __volatile__(                        \
@@ -55,7 +55,7 @@
       "cpuid              \n\t"                \
       "xchg %%edi, %%ebx  \n\t"                \
       : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
-      : "a"(func), "c"(func2));
+      : "a"(func), "c"(func2))
 #endif
 #elif defined(__SUNPRO_C) || \
     defined(__SUNPRO_CC) /* end __GNUC__ or __ANDROID__*/
@@ -67,7 +67,7 @@
       "movl %ebx, %edi \n\t"                   \
       "xchg %rsi, %rbx \n\t"                   \
       : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
-      : "a"(func), "c"(func2));
+      : "a"(func), "c"(func2))
 #else
 #define cpuid(func, func2, ax, bx, cx, dx)     \
   asm volatile(                                \
@@ -76,7 +76,7 @@
       "movl %ebx, %edi  \n\t"                  \
       "popl %ebx        \n\t"                  \
       : "=a"(ax), "=D"(bx), "=c"(cx), "=d"(dx) \
-      : "a"(func), "c"(func2));
+      : "a"(func), "c"(func2))
 #endif
 #else /* end __SUNPRO__ */
 #if VPX_ARCH_X86_64
@@ -391,7 +391,7 @@
   // Reserved                      01B
   // Double Precision (53-Bits)    10B
   // Extended Precision (64-Bits)  11B
-  x87_set_control_word((mode & ~0x300) | 0x200);
+  x87_set_control_word((mode & ~0x300u) | 0x200u);
   return mode;
 }
 
diff --git a/libvpx/vpx_scale/generic/yv12config.c b/libvpx/vpx_scale/generic/yv12config.c
index eee291c..c52dab0 100644
--- a/libvpx/vpx_scale/generic/yv12config.c
+++ b/libvpx/vpx_scale/generic/yv12config.c
@@ -64,6 +64,10 @@
 
     if (!ybf->buffer_alloc) {
       ybf->buffer_alloc = (uint8_t *)vpx_memalign(32, frame_size);
+      if (!ybf->buffer_alloc) {
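+        /* Allocation failed: clear the recorded size so the stale value is
+         * not mistaken for a valid buffer later, and report the error. */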
+        ybf->buffer_alloc_sz = 0;
+        return -1;
+      }
 #if defined(__has_feature)
 #if __has_feature(memory_sanitizer)
       // This memset is needed for fixing the issue of using uninitialized
@@ -75,7 +79,7 @@
       ybf->buffer_alloc_sz = frame_size;
     }
 
-    if (!ybf->buffer_alloc || ybf->buffer_alloc_sz < frame_size) return -1;
+    if (ybf->buffer_alloc_sz < frame_size) return -1;
 
     /* Only support allocating buffers that have a border that's a multiple
      * of 32. The border restriction is required to get 16-byte alignment of
diff --git a/libvpx/vpx_util/loongson_intrinsics.h b/libvpx/vpx_util/loongson_intrinsics.h
new file mode 100644
index 0000000..b8b9e6d
--- /dev/null
+++ b/libvpx/vpx_util/loongson_intrinsics.h
@@ -0,0 +1,2090 @@
+/*
+ * Copyright (c) 2022 The WebM project authors. All Rights Reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ */
+
+#ifndef VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_
+#define VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_
+
+/*
+ * Copyright (c) 2021 Loongson Technology Corporation Limited
+ * All rights reserved.
+ *
+ * Use of this source code is governed by a BSD-style license
+ * that can be found in the LICENSE file in the root of the source
+ * tree. An additional intellectual property rights grant can be found
+ * in the file PATENTS.  All contributing project authors may
+ * be found in the AUTHORS file in the root of the source tree.
+ *
+ * Contributed by Shiyou Yin <yinshiyou-hf@loongson.cn>
+ *                Xiwei Gu   <guxiwei-hf@loongson.cn>
+ *                Lu Wang    <wanglu@loongson.cn>
+ *
+ * This is a header file for the LoongArch builtin extensions.
+ *
+ */
+
+#ifndef LOONGSON_INTRINSICS_H
+#define LOONGSON_INTRINSICS_H
+
+/**
+ * MAJOR version: Macro usage changes.
+ * MINOR version: Add new functions, or bug fixes.
+ * MICRO version: Comment changes or implementation changes.
+ */
+#define LSOM_VERSION_MAJOR 1
+#define LSOM_VERSION_MINOR 2
+#define LSOM_VERSION_MICRO 1
+
+#define DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1) \
+  {                                               \
+    _OUT0 = _INS(_IN0);                           \
+    _OUT1 = _INS(_IN1);                           \
+  }
+
+#define DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1) \
+  {                                                           \
+    _OUT0 = _INS(_IN0, _IN1);                                 \
+    _OUT1 = _INS(_IN2, _IN3);                                 \
+  }
+
+#define DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1) \
+  {                                                                       \
+    _OUT0 = _INS(_IN0, _IN1, _IN2);                                       \
+    _OUT1 = _INS(_IN3, _IN4, _IN5);                                       \
+  }
+
+#define DUP4_ARG1(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1, _OUT2, _OUT3) \
+  {                                                                         \
+    DUP2_ARG1(_INS, _IN0, _IN1, _OUT0, _OUT1);                              \
+    DUP2_ARG1(_INS, _IN2, _IN3, _OUT2, _OUT3);                              \
+  }
+
+#define DUP4_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _OUT0, \
+                  _OUT1, _OUT2, _OUT3)                                         \
+  {                                                                            \
+    DUP2_ARG2(_INS, _IN0, _IN1, _IN2, _IN3, _OUT0, _OUT1);                     \
+    DUP2_ARG2(_INS, _IN4, _IN5, _IN6, _IN7, _OUT2, _OUT3);                     \
+  }
+
+#define DUP4_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _IN6, _IN7, _IN8, \
+                  _IN9, _IN10, _IN11, _OUT0, _OUT1, _OUT2, _OUT3)             \
+  {                                                                           \
+    DUP2_ARG3(_INS, _IN0, _IN1, _IN2, _IN3, _IN4, _IN5, _OUT0, _OUT1);        \
+    DUP2_ARG3(_INS, _IN6, _IN7, _IN8, _IN9, _IN10, _IN11, _OUT2, _OUT3);      \
+  }
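+
+/* For example, DUP2_ARG2(__lsx_vadd_b, _a0, _b0, _a1, _b1, _r0, _r1) expands
+ * to _r0 = __lsx_vadd_b(_a0, _b0); _r1 = __lsx_vadd_b(_a1, _b1); and the
+ * DUP4_* forms simply apply the same instruction four times. */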
+
+#ifdef __loongarch_sx
+#include <lsxintrin.h>
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Signed byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input. Then
+ *               the results are added to signed half-word elements from in_c.
+ * Example     : out = __lsx_vdp2add_h_b(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4, 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_b(__m128i in_c, __m128i in_h,
+                                        __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmaddwev_h_b(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               unsigned byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ *               The results are added to signed half-word elements from in_c.
+ * Example     : out = __lsx_vdp2add_h_bu(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4, 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_bu(__m128i in_c, __m128i in_h,
+                                         __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmaddwev_h_bu(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ *               The results are added to signed half-word elements from in_c.
+ * Example     : out = __lsx_vdp2add_h_bu_b(in_c, in_h, in_l)
+ *        in_c : 1,1,1,1, 1,1,1,1
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : -1,-2,-3,-4, -5,-6,-7,-8, 1,2,3,4, 5,6,7,8
+ *         out : -4,-24,-60,-112, 6,26,62,114
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_h_bu_b(__m128i in_c, __m128i in_h,
+                                           __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmaddwev_h_bu_b(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of half-word vector elements
+ * Arguments   : Inputs  - in_c, in_h, in_l
+ *               Outputs - out
+ *               Return Type - word
+ * Details     : Signed half-word elements from in_h are multiplied by
+ *               signed half-word elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ *               Then the results are added to signed word elements from in_c.
+ * Example     : out = __lsx_vdp2add_w_h(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1
+ *         out : 23,40,41,26
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2add_w_h(__m128i in_c, __m128i in_h,
+                                        __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmaddwev_w_h(in_c, in_h, in_l);
+  out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Signed byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ * Example     : out = __lsx_vdp2_h_b(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_b(__m128i in_h, __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmulwev_h_b(in_h, in_l);
+  out = __lsx_vmaddwod_h_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               unsigned byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ * Example     : out = __lsx_vdp2_h_bu(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_bu(__m128i in_h, __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmulwev_h_bu(in_h, in_l);
+  out = __lsx_vmaddwod_h_bu(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Unsigned byte elements from in_h are multiplied by
+ *               signed byte elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ * Example     : out = __lsx_vdp2_h_bu_b(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,-1
+ *         out : 22,38,38,22, 22,38,38,6
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_h_bu_b(__m128i in_h, __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmulwev_h_bu_b(in_h, in_l);
+  out = __lsx_vmaddwod_h_bu_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - word
+ * Details     : Signed halfword elements from in_h are multiplied by
+ *               signed halfword elements from in_l, and then added adjacent
+ *               to each other to get a result twice the size of input.
+ * Example     : out = __lsx_vdp2_w_h(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1
+ *         out : 22,38,38,22
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_w_h(__m128i in_h, __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmulwev_w_h(in_h, in_l);
+  out = __lsx_vmaddwod_w_h(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of word vector elements
+ * Arguments   : Inputs  - in_h, in_l
+ *               Outputs - out
+ *               Return Type - doubleword
+ * Details     : Signed word elements from in_h are multiplied by
+ *               signed word elements from in_l, and then added adjacent to
+ *               each other to get a result twice the size of input.
+ * Example     : out = __lsx_vdp2_d_w(in_h, in_l)
+ *        in_h : 1,2,3,4
+ *        in_l : 8,7,6,5
+ *         out : 22,38
+ * =============================================================================
+ */
+static inline __m128i __lsx_vdp2_d_w(__m128i in_h, __m128i in_l) {
+  __m128i out;
+
+  out = __lsx_vmulwev_d_w(in_h, in_l);
+  out = __lsx_vmaddwod_d_w(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all halfword elements of input vector between min & max
+ *               out = ((_in) < (min)) ? (min) : (((_in) > (max)) ? (max) :
+ *               (_in))
+ * Arguments   : Inputs  - _in  (input vector)
+ *                       - min  (min threshold)
+ *                       - max  (max threshold)
+ *               Outputs - out  (output vector with clipped elements)
+ *               Return Type - signed halfword
+ * Example     : out = __lsx_vclip_h(_in, min, max)
+ *         _in : -8,2,280,249, -8,255,280,249
+ *         min : 1,1,1,1, 1,1,1,1
+ *         max : 9,9,9,9, 9,9,9,9
+ *         out : 1,2,9,9, 1,9,9,9
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip_h(__m128i _in, __m128i min, __m128i max) {
+  __m128i out;
+
+  out = __lsx_vmax_h(min, _in);
+  out = __lsx_vmin_h(max, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Set each element of vector between 0 and 255
+ * Arguments   : Inputs  - _in
+ *               Outputs - out
+ *               Return Type - halfword
+ * Details     : Signed halfword elements from _in are clamped between 0 and 255.
+ * Example     : out = __lsx_vclip255_h(_in)
+ *         _in : -8,255,280,249, -8,255,280,249
+ *         out : 0,255,255,249, 0,255,255,249
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip255_h(__m128i _in) {
+  __m128i out;
+
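+  /* Clamp negatives to zero, then saturate each unsigned halfword to
+   * (1 << 8) - 1 == 255. */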
+  out = __lsx_vmaxi_h(_in, 0);
+  out = __lsx_vsat_hu(out, 7);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Set each element of vector between 0 and 255
+ * Arguments   : Inputs  - _in
+ *               Outputs - out
+ *               Return Type - word
+ * Details     : Signed word elements from _in are clamped between 0 and 255.
+ * Example     : out = __lsx_vclip255_w(_in)
+ *         _in : -8,255,280,249
+ *         out : 0,255,255,249
+ * =============================================================================
+ */
+static inline __m128i __lsx_vclip255_w(__m128i _in) {
+  __m128i out;
+
+  out = __lsx_vmaxi_w(_in, 0);
+  out = __lsx_vsat_wu(out, 7);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Swap two variables
+ * Arguments   : Inputs  - _in0, _in1
+ *               Outputs - _in0, _in1 (in-place)
+ * Details     : Swapping of two input variables using xor
+ * Example     : LSX_SWAP(_in0, _in1)
+ *        _in0 : 1,2,3,4
+ *        _in1 : 5,6,7,8
+ *   _in0(out) : 5,6,7,8
+ *   _in1(out) : 1,2,3,4
+ * =============================================================================
+ */
+#define LSX_SWAP(_in0, _in1)         \
+  {                                  \
+    _in0 = __lsx_vxor_v(_in0, _in1); \
+    _in1 = __lsx_vxor_v(_in0, _in1); \
+    _in0 = __lsx_vxor_v(_in0, _in1); \
+  }
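+
+/* Note: as with any xor-based swap, passing the same variable for both
+ * arguments zeroes it rather than leaving it unchanged. */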
+
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ * Details     :
+ * Example     :
+ *               1, 2, 3, 4            1, 5, 9,13
+ *               5, 6, 7, 8    to      2, 6,10,14
+ *               9,10,11,12  =====>    3, 7,11,15
+ *              13,14,15,16            4, 8,12,16
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE4x4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    __m128i _t0, _t1, _t2, _t3;                                                \
+                                                                               \
+    _t0 = __lsx_vilvl_w(_in1, _in0);                                           \
+    _t1 = __lsx_vilvh_w(_in1, _in0);                                           \
+    _t2 = __lsx_vilvl_w(_in3, _in2);                                           \
+    _t3 = __lsx_vilvh_w(_in3, _in2);                                           \
+    _out0 = __lsx_vilvl_d(_t2, _t0);                                           \
+    _out1 = __lsx_vilvh_d(_t2, _t0);                                           \
+    _out2 = __lsx_vilvl_d(_t3, _t1);                                           \
+    _out3 = __lsx_vilvh_d(_t3, _t1);                                           \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with byte elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *               _out7
+ * Details     : The rows of the matrix become columns, and the columns
+ *               become rows.
+ * Example     : LSX_TRANSPOSE8x8_B
+ *        _in0 : 00,01,02,03,04,05,06,07, 00,00,00,00,00,00,00,00
+ *        _in1 : 10,11,12,13,14,15,16,17, 00,00,00,00,00,00,00,00
+ *        _in2 : 20,21,22,23,24,25,26,27, 00,00,00,00,00,00,00,00
+ *        _in3 : 30,31,32,33,34,35,36,37, 00,00,00,00,00,00,00,00
+ *        _in4 : 40,41,42,43,44,45,46,47, 00,00,00,00,00,00,00,00
+ *        _in5 : 50,51,52,53,54,55,56,57, 00,00,00,00,00,00,00,00
+ *        _in6 : 60,61,62,63,64,65,66,67, 00,00,00,00,00,00,00,00
+ *        _in7 : 70,71,72,73,74,75,76,77, 00,00,00,00,00,00,00,00
+ *
+ *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
+ *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
+ *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
+ *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
+ *       _out4 : 04,14,24,34,44,54,64,74, 00,00,00,00,00,00,00,00
+ *       _out5 : 05,15,25,35,45,55,65,75, 00,00,00,00,00,00,00,00
+ *       _out6 : 06,16,26,36,46,56,66,76, 00,00,00,00,00,00,00,00
+ *       _out7 : 07,17,27,37,47,57,67,77, 00,00,00,00,00,00,00,00
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    __m128i zero = { 0 };                                                   \
+    __m128i shuf8 = { 0x0F0E0D0C0B0A0908, 0x1716151413121110 };             \
+    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                         \
+                                                                            \
+    _t0 = __lsx_vilvl_b(_in2, _in0);                                        \
+    _t1 = __lsx_vilvl_b(_in3, _in1);                                        \
+    _t2 = __lsx_vilvl_b(_in6, _in4);                                        \
+    _t3 = __lsx_vilvl_b(_in7, _in5);                                        \
+    _t4 = __lsx_vilvl_b(_t1, _t0);                                          \
+    _t5 = __lsx_vilvh_b(_t1, _t0);                                          \
+    _t6 = __lsx_vilvl_b(_t3, _t2);                                          \
+    _t7 = __lsx_vilvh_b(_t3, _t2);                                          \
+    _out0 = __lsx_vilvl_w(_t6, _t4);                                        \
+    _out2 = __lsx_vilvh_w(_t6, _t4);                                        \
+    _out4 = __lsx_vilvl_w(_t7, _t5);                                        \
+    _out6 = __lsx_vilvh_w(_t7, _t5);                                        \
+    _out1 = __lsx_vshuf_b(zero, _out0, shuf8);                              \
+    _out3 = __lsx_vshuf_b(zero, _out2, shuf8);                              \
+    _out5 = __lsx_vshuf_b(zero, _out4, shuf8);                              \
+    _out7 = __lsx_vshuf_b(zero, _out6, shuf8);                              \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with half-word elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ * Details     :
+ * Example     :
+ *              00,01,02,03,04,05,06,07           00,10,20,30,40,50,60,70
+ *              10,11,12,13,14,15,16,17           01,11,21,31,41,51,61,71
+ *              20,21,22,23,24,25,26,27           02,12,22,32,42,52,62,72
+ *              30,31,32,33,34,35,36,37    to     03,13,23,33,43,53,63,73
+ *              40,41,42,43,44,45,46,47  ======>  04,14,24,34,44,54,64,74
+ *              50,51,52,53,54,55,56,57           05,15,25,35,45,55,65,75
+ *              60,61,62,63,64,65,66,67           06,16,26,36,46,56,66,76
+ *              70,71,72,73,74,75,76,77           07,17,27,37,47,57,67,77
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    __m128i _s0, _s1, _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;               \
+                                                                            \
+    _s0 = __lsx_vilvl_h(_in6, _in4);                                        \
+    _s1 = __lsx_vilvl_h(_in7, _in5);                                        \
+    _t0 = __lsx_vilvl_h(_s1, _s0);                                          \
+    _t1 = __lsx_vilvh_h(_s1, _s0);                                          \
+    _s0 = __lsx_vilvh_h(_in6, _in4);                                        \
+    _s1 = __lsx_vilvh_h(_in7, _in5);                                        \
+    _t2 = __lsx_vilvl_h(_s1, _s0);                                          \
+    _t3 = __lsx_vilvh_h(_s1, _s0);                                          \
+    _s0 = __lsx_vilvl_h(_in2, _in0);                                        \
+    _s1 = __lsx_vilvl_h(_in3, _in1);                                        \
+    _t4 = __lsx_vilvl_h(_s1, _s0);                                          \
+    _t5 = __lsx_vilvh_h(_s1, _s0);                                          \
+    _s0 = __lsx_vilvh_h(_in2, _in0);                                        \
+    _s1 = __lsx_vilvh_h(_in3, _in1);                                        \
+    _t6 = __lsx_vilvl_h(_s1, _s0);                                          \
+    _t7 = __lsx_vilvh_h(_s1, _s0);                                          \
+                                                                            \
+    _out0 = __lsx_vpickev_d(_t0, _t4);                                      \
+    _out2 = __lsx_vpickev_d(_t1, _t5);                                      \
+    _out4 = __lsx_vpickev_d(_t2, _t6);                                      \
+    _out6 = __lsx_vpickev_d(_t3, _t7);                                      \
+    _out1 = __lsx_vpickod_d(_t0, _t4);                                      \
+    _out3 = __lsx_vpickod_d(_t1, _t5);                                      \
+    _out5 = __lsx_vpickod_d(_t2, _t6);                                      \
+    _out7 = __lsx_vpickod_d(_t3, _t7);                                      \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 8x4 byte block into 4x8
+ * Arguments   : Inputs  - _in0 ~ _in7                 (input 8x4 byte block)
+ *               Outputs - _out0, _out1, _out2, _out3  (output 4x8 byte block)
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
+ * Example     : LSX_TRANSPOSE8x4_B
+ *        _in0 : 00,01,02,03,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in1 : 10,11,12,13,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in2 : 20,21,22,23,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in3 : 30,31,32,33,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in4 : 40,41,42,43,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in5 : 50,51,52,53,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in6 : 60,61,62,63,00,00,00,00, 00,00,00,00,00,00,00,00
+ *        _in7 : 70,71,72,73,00,00,00,00, 00,00,00,00,00,00,00,00
+ *
+ *       _out0 : 00,10,20,30,40,50,60,70, 00,00,00,00,00,00,00,00
+ *       _out1 : 01,11,21,31,41,51,61,71, 00,00,00,00,00,00,00,00
+ *       _out2 : 02,12,22,32,42,52,62,72, 00,00,00,00,00,00,00,00
+ *       _out3 : 03,13,23,33,43,53,63,73, 00,00,00,00,00,00,00,00
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE8x4_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7, \
+                           _out0, _out1, _out2, _out3)                     \
+  {                                                                        \
+    __m128i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                            \
+                                                                           \
+    _tmp0_m = __lsx_vpackev_w(_in4, _in0);                                 \
+    _tmp1_m = __lsx_vpackev_w(_in5, _in1);                                 \
+    _tmp2_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                             \
+    _tmp0_m = __lsx_vpackev_w(_in6, _in2);                                 \
+    _tmp1_m = __lsx_vpackev_w(_in7, _in3);                                 \
+                                                                           \
+    _tmp3_m = __lsx_vilvl_b(_tmp1_m, _tmp0_m);                             \
+    _tmp0_m = __lsx_vilvl_h(_tmp3_m, _tmp2_m);                             \
+    _tmp1_m = __lsx_vilvh_h(_tmp3_m, _tmp2_m);                             \
+                                                                           \
+    _out0 = __lsx_vilvl_w(_tmp1_m, _tmp0_m);                               \
+    _out2 = __lsx_vilvh_w(_tmp1_m, _tmp0_m);                               \
+    _out1 = __lsx_vilvh_d(_out2, _out0);                                   \
+    _out3 = __lsx_vilvh_d(_out0, _out2);                                   \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose 16x8 block with byte elements in vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3, in4, in5, in6, in7, in8
+ *                         in9, in10, in11, in12, in13, in14, in15
+ *               Outputs - out0, out1, out2, out3, out4, out5, out6, out7
+ * Details     :
+ * Example     :
+ *              000,001,002,003,004,005,006,007
+ *              008,009,010,011,012,013,014,015
+ *              016,017,018,019,020,021,022,023
+ *              024,025,026,027,028,029,030,031
+ *              032,033,034,035,036,037,038,039
+ *              040,041,042,043,044,045,046,047        000,008,...,112,120
+ *              048,049,050,051,052,053,054,055        001,009,...,113,121
+ *              056,057,058,059,060,061,062,063   to   002,010,...,114,122
+ *              064,065,066,067,068,069,070,071 =====> 003,011,...,115,123
+ *              072,073,074,075,076,077,078,079        004,012,...,116,124
+ *              080,081,082,083,084,085,086,087        005,013,...,117,125
+ *              088,089,090,091,092,093,094,095        006,014,...,118,126
+ *              096,097,098,099,100,101,102,103        007,015,...,119,127
+ *              104,105,106,107,108,109,110,111
+ *              112,113,114,115,116,117,118,119
+ *              120,121,122,123,124,125,126,127
+ * =============================================================================
+ */
+#define LSX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                            _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
+                            _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+                            _out6, _out7)                                    \
+  {                                                                          \
+    __m128i _tmp0, _tmp1, _tmp2, _tmp3, _tmp4, _tmp5, _tmp6, _tmp7;          \
+    __m128i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                          \
+    DUP4_ARG2(__lsx_vilvl_b, _in2, _in0, _in3, _in1, _in6, _in4, _in7, _in5, \
+              _tmp0, _tmp1, _tmp2, _tmp3);                                   \
+    DUP4_ARG2(__lsx_vilvl_b, _in10, _in8, _in11, _in9, _in14, _in12, _in15,  \
+              _in13, _tmp4, _tmp5, _tmp6, _tmp7);                            \
+    DUP2_ARG2(__lsx_vilvl_b, _tmp1, _tmp0, _tmp3, _tmp2, _t0, _t2);          \
+    DUP2_ARG2(__lsx_vilvh_b, _tmp1, _tmp0, _tmp3, _tmp2, _t1, _t3);          \
+    DUP2_ARG2(__lsx_vilvl_b, _tmp5, _tmp4, _tmp7, _tmp6, _t4, _t6);          \
+    DUP2_ARG2(__lsx_vilvh_b, _tmp5, _tmp4, _tmp7, _tmp6, _t5, _t7);          \
+    DUP2_ARG2(__lsx_vilvl_w, _t2, _t0, _t3, _t1, _tmp0, _tmp4);              \
+    DUP2_ARG2(__lsx_vilvh_w, _t2, _t0, _t3, _t1, _tmp2, _tmp6);              \
+    DUP2_ARG2(__lsx_vilvl_w, _t6, _t4, _t7, _t5, _tmp1, _tmp5);              \
+    DUP2_ARG2(__lsx_vilvh_w, _t6, _t4, _t7, _t5, _tmp3, _tmp7);              \
+    DUP2_ARG2(__lsx_vilvl_d, _tmp1, _tmp0, _tmp3, _tmp2, _out0, _out2);      \
+    DUP2_ARG2(__lsx_vilvh_d, _tmp1, _tmp0, _tmp3, _tmp2, _out1, _out3);      \
+    DUP2_ARG2(__lsx_vilvl_d, _tmp5, _tmp4, _tmp7, _tmp6, _out4, _out6);      \
+    DUP2_ARG2(__lsx_vilvh_d, _tmp5, _tmp4, _tmp7, _tmp6, _out5, _out7);      \
+  }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 4 input vectors
+ * Arguments   : Inputs  - in0, in1, in2, in3
+ *               Outputs - out0, out1, out2, out3
+ * Details     : Butterfly operation
+ * Example     :
+ *               out0 = in0 + in3;
+ *               out1 = in1 + in2;
+ *               out2 = in1 - in2;
+ *               out3 = in0 - in3;
+ * =============================================================================
+ */
+#define LSX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                           \
+    _out0 = __lsx_vadd_b(_in0, _in3);                                         \
+    _out1 = __lsx_vadd_b(_in1, _in2);                                         \
+    _out2 = __lsx_vsub_b(_in1, _in2);                                         \
+    _out3 = __lsx_vsub_b(_in0, _in3);                                         \
+  }
+#define LSX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                           \
+    _out0 = __lsx_vadd_h(_in0, _in3);                                         \
+    _out1 = __lsx_vadd_h(_in1, _in2);                                         \
+    _out2 = __lsx_vsub_h(_in1, _in2);                                         \
+    _out3 = __lsx_vsub_h(_in0, _in3);                                         \
+  }
+#define LSX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                           \
+    _out0 = __lsx_vadd_w(_in0, _in3);                                         \
+    _out1 = __lsx_vadd_w(_in1, _in2);                                         \
+    _out2 = __lsx_vsub_w(_in1, _in2);                                         \
+    _out3 = __lsx_vsub_w(_in0, _in3);                                         \
+  }
+#define LSX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                           \
+    _out0 = __lsx_vadd_d(_in0, _in3);                                         \
+    _out1 = __lsx_vadd_d(_in1, _in2);                                         \
+    _out2 = __lsx_vsub_d(_in1, _in2);                                         \
+    _out3 = __lsx_vsub_d(_in0, _in3);                                         \
+  }
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 8 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
+ *               Outputs - _out0, _out1, _out2, _out3, ~
+ * Details     : Butterfly operation
+ * Example     :
+ *              _out0 = _in0 + _in7;
+ *              _out1 = _in1 + _in6;
+ *              _out2 = _in2 + _in5;
+ *              _out3 = _in3 + _in4;
+ *              _out4 = _in3 - _in4;
+ *              _out5 = _in2 - _in5;
+ *              _out6 = _in1 - _in6;
+ *              _out7 = _in0 - _in7;
+ * =============================================================================
+ */
+#define LSX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_b(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_b(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_b(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_b(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_b(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_b(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_b(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_b(_in0, _in7);                                      \
+  }
+
+#define LSX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_h(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_h(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_h(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_h(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_h(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_h(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_h(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_h(_in0, _in7);                                      \
+  }
+
+#define LSX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_w(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_w(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_w(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_w(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_w(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_w(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_w(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_w(_in0, _in7);                                      \
+  }
+
+#define LSX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                          _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                          _out7)                                           \
+  {                                                                        \
+    _out0 = __lsx_vadd_d(_in0, _in7);                                      \
+    _out1 = __lsx_vadd_d(_in1, _in6);                                      \
+    _out2 = __lsx_vadd_d(_in2, _in5);                                      \
+    _out3 = __lsx_vadd_d(_in3, _in4);                                      \
+    _out4 = __lsx_vsub_d(_in3, _in4);                                      \
+    _out5 = __lsx_vsub_d(_in2, _in5);                                      \
+    _out6 = __lsx_vsub_d(_in1, _in6);                                      \
+    _out7 = __lsx_vsub_d(_in0, _in7);                                      \
+  }
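+
+/*
+ * Illustrative sketch (not part of the upstream header): one 8-point
+ * butterfly stage over halfword vectors, the usual first step of an
+ * 8-point transform. The array-based wrapper and its name are
+ * hypothetical.
+ */
+static inline void lsx_butterfly8_h_example(const __m128i in[8],
+                                            __m128i out[8]) {
+  /* out[k] = in[k] + in[7-k] for k = 0..3 and out[k] = in[7-k] - in[k]
+   * for k = 4..7, per the macro definition above. */
+  LSX_BUTTERFLY_8_H(in[0], in[1], in[2], in[3], in[4], in[5], in[6], in[7],
+                    out[0], out[1], out[2], out[3], out[4], out[5], out[6],
+                    out[7]);
+}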
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 16 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
+ *               Outputs - _out0, _out1, _out2, _out3, ~
+ * Details     : Butterfly operation
+ * Example     :
+ *              _out0 = _in0 + _in15;
+ *              _out1 = _in1 + _in14;
+ *              _out2 = _in2 + _in13;
+ *              _out3 = _in3 + _in12;
+ *              _out4 = _in4 + _in11;
+ *              _out5 = _in5 + _in10;
+ *              _out6 = _in6 + _in9;
+ *              _out7 = _in7 + _in8;
+ *              _out8 = _in7 - _in8;
+ *              _out9 = _in6 - _in9;
+ *              _out10 = _in5 - _in10;
+ *              _out11 = _in4 - _in11;
+ *              _out12 = _in3 - _in12;
+ *              _out13 = _in2 - _in13;
+ *              _out14 = _in1 - _in14;
+ *              _out15 = _in0 - _in15;
+ * =============================================================================
+ */
+
+#define LSX_BUTTERFLY_16_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
+                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
+                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
+                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
+                           _out13, _out14, _out15)                             \
+  {                                                                            \
+    _out0 = __lsx_vadd_b(_in0, _in15);                                         \
+    _out1 = __lsx_vadd_b(_in1, _in14);                                         \
+    _out2 = __lsx_vadd_b(_in2, _in13);                                         \
+    _out3 = __lsx_vadd_b(_in3, _in12);                                         \
+    _out4 = __lsx_vadd_b(_in4, _in11);                                         \
+    _out5 = __lsx_vadd_b(_in5, _in10);                                         \
+    _out6 = __lsx_vadd_b(_in6, _in9);                                          \
+    _out7 = __lsx_vadd_b(_in7, _in8);                                          \
+                                                                               \
+    _out8 = __lsx_vsub_b(_in7, _in8);                                          \
+    _out9 = __lsx_vsub_b(_in6, _in9);                                          \
+    _out10 = __lsx_vsub_b(_in5, _in10);                                        \
+    _out11 = __lsx_vsub_b(_in4, _in11);                                        \
+    _out12 = __lsx_vsub_b(_in3, _in12);                                        \
+    _out13 = __lsx_vsub_b(_in2, _in13);                                        \
+    _out14 = __lsx_vsub_b(_in1, _in14);                                        \
+    _out15 = __lsx_vsub_b(_in0, _in15);                                        \
+  }
+
+#define LSX_BUTTERFLY_16_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
+                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
+                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
+                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
+                           _out13, _out14, _out15)                             \
+  {                                                                            \
+    _out0 = __lsx_vadd_h(_in0, _in15);                                         \
+    _out1 = __lsx_vadd_h(_in1, _in14);                                         \
+    _out2 = __lsx_vadd_h(_in2, _in13);                                         \
+    _out3 = __lsx_vadd_h(_in3, _in12);                                         \
+    _out4 = __lsx_vadd_h(_in4, _in11);                                         \
+    _out5 = __lsx_vadd_h(_in5, _in10);                                         \
+    _out6 = __lsx_vadd_h(_in6, _in9);                                          \
+    _out7 = __lsx_vadd_h(_in7, _in8);                                          \
+                                                                               \
+    _out8 = __lsx_vsub_h(_in7, _in8);                                          \
+    _out9 = __lsx_vsub_h(_in6, _in9);                                          \
+    _out10 = __lsx_vsub_h(_in5, _in10);                                        \
+    _out11 = __lsx_vsub_h(_in4, _in11);                                        \
+    _out12 = __lsx_vsub_h(_in3, _in12);                                        \
+    _out13 = __lsx_vsub_h(_in2, _in13);                                        \
+    _out14 = __lsx_vsub_h(_in1, _in14);                                        \
+    _out15 = __lsx_vsub_h(_in0, _in15);                                        \
+  }
+
+#define LSX_BUTTERFLY_16_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
+                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
+                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
+                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
+                           _out13, _out14, _out15)                             \
+  {                                                                            \
+    _out0 = __lsx_vadd_w(_in0, _in15);                                         \
+    _out1 = __lsx_vadd_w(_in1, _in14);                                         \
+    _out2 = __lsx_vadd_w(_in2, _in13);                                         \
+    _out3 = __lsx_vadd_w(_in3, _in12);                                         \
+    _out4 = __lsx_vadd_w(_in4, _in11);                                         \
+    _out5 = __lsx_vadd_w(_in5, _in10);                                         \
+    _out6 = __lsx_vadd_w(_in6, _in9);                                          \
+    _out7 = __lsx_vadd_w(_in7, _in8);                                          \
+                                                                               \
+    _out8 = __lsx_vsub_w(_in7, _in8);                                          \
+    _out9 = __lsx_vsub_w(_in6, _in9);                                          \
+    _out10 = __lsx_vsub_w(_in5, _in10);                                        \
+    _out11 = __lsx_vsub_w(_in4, _in11);                                        \
+    _out12 = __lsx_vsub_w(_in3, _in12);                                        \
+    _out13 = __lsx_vsub_w(_in2, _in13);                                        \
+    _out14 = __lsx_vsub_w(_in1, _in14);                                        \
+    _out15 = __lsx_vsub_w(_in0, _in15);                                        \
+  }
+
+#define LSX_BUTTERFLY_16_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,     \
+                           _in8, _in9, _in10, _in11, _in12, _in13, _in14,      \
+                           _in15, _out0, _out1, _out2, _out3, _out4, _out5,    \
+                           _out6, _out7, _out8, _out9, _out10, _out11, _out12, \
+                           _out13, _out14, _out15)                             \
+  {                                                                            \
+    _out0 = __lsx_vadd_d(_in0, _in15);                                         \
+    _out1 = __lsx_vadd_d(_in1, _in14);                                         \
+    _out2 = __lsx_vadd_d(_in2, _in13);                                         \
+    _out3 = __lsx_vadd_d(_in3, _in12);                                         \
+    _out4 = __lsx_vadd_d(_in4, _in11);                                         \
+    _out5 = __lsx_vadd_d(_in5, _in10);                                         \
+    _out6 = __lsx_vadd_d(_in6, _in9);                                          \
+    _out7 = __lsx_vadd_d(_in7, _in8);                                          \
+                                                                               \
+    _out8 = __lsx_vsub_d(_in7, _in8);                                          \
+    _out9 = __lsx_vsub_d(_in6, _in9);                                          \
+    _out10 = __lsx_vsub_d(_in5, _in10);                                        \
+    _out11 = __lsx_vsub_d(_in4, _in11);                                        \
+    _out12 = __lsx_vsub_d(_in3, _in12);                                        \
+    _out13 = __lsx_vsub_d(_in2, _in13);                                        \
+    _out14 = __lsx_vsub_d(_in1, _in14);                                        \
+    _out15 = __lsx_vsub_d(_in0, _in15);                                        \
+  }
+
+#endif  // LSX
+
+#ifdef __loongarch_asx
+#include <lasxintrin.h>
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - unsigned halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               unsigned byte elements from in_l, producing a result
+ *               twice the size of the input, i.e. unsigned halfword.
+ *               The products of adjacent odd-even elements are then
+ *               added together and stored to the out vector
+ * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_h_bu(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_h_bu(in_h, in_l);
+  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed halfword
+ * Details     : Signed byte elements from in_h are multiplied with
+ *               signed byte elements from in_l, producing a result
+ *               twice the size of the input, i.e. signed halfword.
+ *               The products of adjacent odd-even elements are then
+ *               added together and stored to the out vector
+ * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_h_b(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_h_b(in_h, in_l);
+  out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Signed halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l, producing a result
+ *               twice the size of the input, i.e. signed word.
+ *               The products of adjacent odd-even elements are then
+ *               added together and stored to the out vector.
+ * Example     : out = __lasx_xvdp2_w_h(in_h, in_l)
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1
+ *         out : 22,38,38,22, 22,38,38,22
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_w_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_w_h(in_h, in_l);
+  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+  return out;
+}
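+
+/*
+ * Illustrative sketch (not part of the upstream header): a scalar reference
+ * for one word lane of __lasx_xvdp2_w_h, matching the documented example
+ * (lane 0 of 1,2,... dot 8,7,... is 1*8 + 2*7 = 22). The function name is
+ * hypothetical.
+ */
+static inline int dp2_w_h_scalar_ref(const short *h, const short *l,
+                                     int lane) {
+  /* Word lane i holds h[2i]*l[2i] + h[2i+1]*l[2i+1]. */
+  return (int)h[2 * lane] * l[2 * lane] +
+         (int)h[2 * lane + 1] * l[2 * lane + 1];
+}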
+
+/*
+ * =============================================================================
+ * Description : Dot product of word vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed doubleword
+ * Details     : Signed word elements from in_h are multiplied with
+ *               signed word elements from in_l, producing a result
+ *               twice the size of the input, i.e. signed doubleword.
+ *               The products of adjacent odd-even elements are then
+ *               added together and stored to the out vector.
+ * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_d_w(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_d_w(in_h, in_l);
+  out = __lasx_xvmaddwod_d_w(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Unsigned halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l, producing a result
+ *               twice the size of the input, i.e. signed word.
+ *               The products of adjacent odd-even elements are then
+ *               added together and stored to the out vector
+ * Example     : See out = __lasx_xvdp2_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2_w_hu_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_w_hu_h(in_h, in_l);
+  out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed halfword
+ * Details     : Signed byte elements from in_h are multiplied with
+ *               signed byte elements from in_l, producing a result
+ *               twice the size of the input, i.e. signed halfword.
+ *               The products of adjacent odd-even elements are then
+ *               added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_b(__m256i in_c, __m256i in_h,
+                                          __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmaddwev_h_b(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_h_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - unsigned halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               unsigned byte elements from in_l, producing a result
+ *               twice the size of the input, i.e. unsigned halfword.
+ *               The products of adjacent odd-even elements are then
+ *               added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_bu(__m256i in_c, __m256i in_h,
+                                           __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmaddwev_h_bu(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of byte vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               signed byte elements from in_l, producing a result
+ *               twice the size of the input, i.e. signed halfword.
+ *               The products of adjacent odd-even elements are then
+ *               added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_h_bu_b(__m256i in_c, __m256i in_h,
+                                             __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmaddwev_h_bu_b(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_h_bu_b(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of halfword vector elements
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Signed halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l, producing a result
+ *               twice the size of the input, i.e. signed word.
+ *               The products of adjacent odd-even elements are then
+ *               added to the in_c vector.
+ * Example     : out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4, 1,2,3,4
+ *        in_h : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8,
+ *        in_l : 8,7,6,5, 4,3,2,1, 8,7,6,5, 4,3,2,1,
+ *         out : 23,40,41,26, 23,40,41,26
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_h(__m256i in_c, __m256i in_h,
+                                          __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmaddwev_w_h(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+  return out;
+}
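+
+/*
+ * Illustrative sketch (not part of the upstream header): the typical
+ * multiply-accumulate loop built on __lasx_xvdp2add_w_h, summing halfword
+ * products into a running word accumulator. Buffer names are hypothetical;
+ * both buffers are assumed to hold n * 16 halfwords.
+ */
+static inline __m256i dp2add_w_h_loop_example(short *coef, short *src,
+                                              int n) {
+  __m256i acc = __lasx_xvldi(0); /* start from a zero accumulator */
+  int i;
+
+  for (i = 0; i < n; i++) {
+    __m256i c = __lasx_xvld(coef + i * 16, 0);
+    __m256i s = __lasx_xvld(src + i * 16, 0);
+    acc = __lasx_xvdp2add_w_h(acc, c, s);
+  }
+  return acc;
+}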
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of halfword vector elements
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - unsigned word
+ * Details     : Unsigned halfword elements from in_h are multiplied with
+ *               unsigned halfword elements from in_l, producing a result
+ *               twice the size of the input, i.e. unsigned word.
+ *               The products of adjacent odd-even elements are then
+ *               added to the in_c vector.
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_hu(__m256i in_c, __m256i in_h,
+                                           __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmaddwev_w_hu(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_w_hu(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product & addition of halfword vector elements
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Unsigned halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l, producing a result
+ *               twice the size of the input, i.e. signed word.
+ *               The products of adjacent odd-even elements are then
+ *               added to the in_c vector
+ * Example     : See out = __lasx_xvdp2add_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2add_w_hu_h(__m256i in_c, __m256i in_h,
+                                             __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmaddwev_w_hu_h(in_c, in_h, in_l);
+  out = __lasx_xvmaddwod_w_hu_h(out, in_h, in_l);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Vector Unsigned Dot Product and Subtract
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - signed halfword
+ * Details     : Unsigned byte elements from in_h are multiplied with
+ *               unsigned byte elements from in_l, producing a result
+ *               twice the size of the input, i.e. unsigned halfword.
+ *               The products of adjacent odd-even elements are added
+ *               together and subtracted from the double-width elements
+ *               of the in_c vector.
+ * Example     : See out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2sub_h_bu(__m256i in_c, __m256i in_h,
+                                           __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_h_bu(in_h, in_l);
+  out = __lasx_xvmaddwod_h_bu(out, in_h, in_l);
+  out = __lasx_xvsub_h(in_c, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Vector Signed Dot Product and Subtract
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ *               Return Type - signed word
+ * Details     : Signed halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l, producing a result
+ *               twice the size of the input, i.e. signed word.
+ *               The products of adjacent odd-even elements are added
+ *               together and subtracted from the double-width elements
+ *               of the in_c vector.
+ * Example     : out = __lasx_xvdp2sub_w_h(in_c, in_h, in_l)
+ *        in_c : 0,0,0,0, 0,0,0,0
+ *        in_h : 3,1,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1
+ *        in_l : 2,1,1,0, 1,0,0,0, 0,0,1,0, 1,0,0,1
+ *         out : -7,-3,0,0, 0,-1,0,-1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp2sub_w_h(__m256i in_c, __m256i in_h,
+                                          __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_w_h(in_h, in_l);
+  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+  out = __lasx_xvsub_w(in_c, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Dot product of halfword vector elements
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ *               Return Type - signed doubleword
+ * Details     : Signed halfword elements from in_h are multiplied with
+ *               signed halfword elements from in_l, producing signed word
+ *               products. The products of each group of four adjacent
+ *               elements are then added together and stored to the out
+ *               vector as signed doublewords.
+ * Example     : out = __lasx_xvdp4_d_h(in_h, in_l)
+ *        in_h :  3,1,3,0, 0,0,0,1, 0,0,1,-1, 0,0,0,1
+ *        in_l : -2,1,1,0, 1,0,0,0, 0,0,1, 0, 1,0,0,1
+ *         out : -2,0,1,1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvdp4_d_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvmulwev_w_h(in_h, in_l);
+  out = __lasx_xvmaddwod_w_h(out, in_h, in_l);
+  out = __lasx_xvhaddw_d_w(out, out);
+  return out;
+}
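+
+/*
+ * Illustrative sketch (not part of the upstream header): a scalar reference
+ * for one doubleword lane of __lasx_xvdp4_d_h; lane i sums the four products
+ * h[4i+k]*l[4i+k], k = 0..3. The function name is hypothetical.
+ */
+static inline long long dp4_d_h_scalar_ref(const short *h, const short *l,
+                                           int lane) {
+  long long sum = 0;
+  int k;
+
+  for (k = 0; k < 4; k++) sum += (long long)h[4 * lane + k] * l[4 * lane + k];
+  return sum;
+}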
+
+/*
+ * =============================================================================
+ * Description : The high half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are added after the
+ *               higher half of the two-fold sign extension (signed byte
+ *               to signed halfword) and stored to the out vector.
+ * Example     : See out = __lasx_xvaddwh_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwh_h_b(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvilvh_b(in_h, in_l);
+  out = __lasx_xvhaddw_h_b(out, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The high half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are added after the
+ *               higher half of the two-fold sign extension (signed halfword
+ *               to signed word) and stored to the out vector.
+ * Example     : out = __lasx_xvaddwh_w_h(in_h, in_l)
+ *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
+ *         out : 1,0,0,-1, 1,0,0, 2
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwh_w_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvilvh_h(in_h, in_l);
+  out = __lasx_xvhaddw_w_h(out, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are added after the
+ *               lower half of the two-fold sign extension (signed byte
+ *               to signed halfword) and stored to the out vector.
+ * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_h_b(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvilvl_b(in_h, in_l);
+  out = __lasx_xvhaddw_h_b(out, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are added after the
+ *               lower half of the two-fold sign extension (signed halfword
+ *               to signed word) and stored to the out vector.
+ * Example     : out = __lasx_xvaddwl_w_h(in_h, in_l)
+ *        in_h : 3, 0,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ *        in_l : 2,-1,1,2, 1,0,0, 0, 1,0,1, 0, 1,0,0,1
+ *         out : 5,-1,4,2, 1,0,2,-1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_w_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvilvl_h(in_h, in_l);
+  out = __lasx_xvhaddw_w_h(out, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are added after the
+ *               lower half of the two-fold zero extension (unsigned byte
+ *               to unsigned halfword) and stored to the out vector.
+ * Example     : See out = __lasx_xvaddwl_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddwl_h_bu(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvilvl_b(in_h, in_l);
+  out = __lasx_xvhaddw_hu_bu(out, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_l vector, after two-fold zero extension (unsigned byte
+ *               to unsigned halfword), is added to the in_h vector.
+ * Example     : See out = __lasx_xvaddw_w_w_h(in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddw_h_h_bu(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvsllwil_hu_bu(in_l, 0);
+  out = __lasx_xvadd_h(in_h, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added after being doubled.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_l vector, after two-fold sign extension (signed halfword
+ *               to signed word), is added to the in_h vector.
+ * Example     : out = __lasx_xvaddw_w_w_h(in_h, in_l)
+ *        in_h : 0, 1,0,0, -1,0,0,1,
+ *        in_l : 2,-1,1,2,  1,0,0,0, 0,0,1,0, 1,0,0,1,
+ *         out : 2, 0,1,2, -1,0,1,1,
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvaddw_w_w_h(__m256i in_h, __m256i in_l) {
+  __m256i out;
+
+  out = __lasx_xvsllwil_w_h(in_l, 0);
+  out = __lasx_xvadd_w(in_h, out);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication and addition calculation after expansion
+ *               of the lower half of the vector.
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are multiplied after
+ *               the lower half of the two-fold sign extension (signed halfword
+ *               to signed word), and the result is added to the vector in_c,
+ *               then stored to the out vector.
+ * Example     : out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
+ *        in_c : 1,2,3,4, 5,6,7,8
+ *        in_h : 1,2,3,4, 1,2,3,4, 5,6,7,8, 5,6,7,8
+ *        in_l : 200, 300, 400, 500,  2000, 3000, 4000, 5000,
+ *              -200,-300,-400,-500, -2000,-3000,-4000,-5000
+ *         out : 201, 602,1203,2004, -995, -1794,-2793,-3992
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmaddwl_w_h(__m256i in_c, __m256i in_h,
+                                          __m256i in_l) {
+  __m256i tmp0, tmp1, out;
+
+  tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
+  tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
+  tmp0 = __lasx_xvmul_w(tmp0, tmp1);
+  out = __lasx_xvadd_w(tmp0, in_c);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication and addition calculation after expansion
+ *               of the higher half of the vector.
+ * Arguments   : Inputs - in_c, in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are multiplied after
+ *               the higher half of the two-fold sign extension (signed
+ *               halfword to signed word), and the result is added to
+ *               the vector in_c, then stored to the out vector.
+ * Example     : See out = __lasx_xvmaddwl_w_h(in_c, in_h, in_l)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmaddwh_w_h(__m256i in_c, __m256i in_h,
+                                          __m256i in_l) {
+  __m256i tmp0, tmp1, out;
+
+  tmp0 = __lasx_xvilvh_h(in_h, in_h);
+  tmp1 = __lasx_xvilvh_h(in_l, in_l);
+  tmp0 = __lasx_xvmulwev_w_h(tmp0, tmp1);
+  out = __lasx_xvadd_w(tmp0, in_c);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication calculation after expansion of the lower
+ *               half of the vector.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are multiplied after
+ *               the lower half of the two-fold sign extension (signed
+ *               halfword to signed word), then stored to the out vector.
+ * Example     : out = __lasx_xvmulwl_w_h(in_h, in_l)
+ *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
+ *         out : 6,1,3,0, 0,0,1,0
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmulwl_w_h(__m256i in_h, __m256i in_l) {
+  __m256i tmp0, tmp1, out;
+
+  tmp0 = __lasx_xvsllwil_w_h(in_h, 0);
+  tmp1 = __lasx_xvsllwil_w_h(in_l, 0);
+  out = __lasx_xvmul_w(tmp0, tmp1);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Multiplication calculation after expansion of the higher
+ *               half of the vector.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_h vector and the in_l vector are multiplied after
+ *               the higher half of the two-fold sign extension (signed
+ *               halfword to signed word), then stored to the out vector.
+ * Example     : out = __lasx_xvmulwh_w_h(in_h, in_l)
+ *        in_h : 3,-1,3,0, 0,0,0,-1, 0,0,1,-1, 0,0,0,1
+ *        in_l : 2,-1,1,2, 1,0,0, 0, 0,0,1, 0, 1,0,0,1
+ *         out : 0,0,0,0, 0,0,0,1
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvmulwh_w_h(__m256i in_h, __m256i in_l) {
+  __m256i tmp0, tmp1, out;
+
+  tmp0 = __lasx_xvilvh_h(in_h, in_h);
+  tmp1 = __lasx_xvilvh_h(in_l, in_l);
+  out = __lasx_xvmulwev_w_h(tmp0, tmp1);
+  return out;
+}
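+
+/*
+ * Illustrative sketch (not part of the upstream header): producing all
+ * widened products of two halfword vectors by pairing the low- and
+ * high-half helpers above. The wrapper name is hypothetical.
+ */
+static inline void mulw_w_h_full_example(__m256i a, __m256i b, __m256i *lo,
+                                         __m256i *hi) {
+  *lo = __lasx_xvmulwl_w_h(a, b); /* low four halfwords per 128-bit lane */
+  *hi = __lasx_xvmulwh_w_h(a, b); /* high four halfwords per 128-bit lane */
+}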
+
+/*
+ * =============================================================================
+ * Description : The low half of the vector elements are expanded and
+ *               added with unsigned saturation.
+ * Arguments   : Inputs - in_h, in_l
+ *               Output - out
+ * Details     : The in_l vector, after the lower half of the two-fold zero
+ *               extension (unsigned byte to unsigned halfword), is added to
+ *               the in_h vector with unsigned saturation. The results are
+ *               stored to the out vector.
+ * Example     : out = __lasx_xvsaddw_hu_hu_bu(in_h, in_l)
+ *        in_h : 2,65532,1,2, 1,0,0,0, 0,0,1,0, 1,0,0,1
+ *        in_l : 3,6,3,0, 0,0,0,1, 0,0,1,1, 0,0,0,1, 3,18,3,0, 0,0,0,1, 0,0,1,1,
+ *               0,0,0,1
+ *        out  : 5,65535,4,2, 1,0,0,1, 3,18,4,0, 1,0,0,2,
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsaddw_hu_hu_bu(__m256i in_h, __m256i in_l) {
+  __m256i tmp1, out;
+  __m256i zero = { 0 };
+
+  tmp1 = __lasx_xvilvl_b(zero, in_l);
+  out = __lasx_xvsadd_hu(in_h, tmp1);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all halfword elements of input vector between min & max
+ *               out = ((in) < (min)) ? (min) : (((in) > (max)) ? (max) : (in))
+ * Arguments   : Inputs  - in    (input vector)
+ *                       - min   (min threshold)
+ *                       - max   (max threshold)
+ *               Outputs - out   (output vector with clipped elements)
+ *               Return Type - signed halfword
+ * Example     : out = __lasx_xvclip_h(in, min, max)
+ *          in : -8,2,280,249, -8,255,280,249, 4,4,4,4, 5,5,5,5
+ *         min : 1,1,1,1, 1,1,1,1, 1,1,1,1, 1,1,1,1
+ *         max : 9,9,9,9, 9,9,9,9, 9,9,9,9, 9,9,9,9
+ *         out : 1,2,9,9, 1,9,9,9, 4,4,4,4, 5,5,5,5
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip_h(__m256i in, __m256i min, __m256i max) {
+  __m256i out;
+
+  out = __lasx_xvmax_h(min, in);
+  out = __lasx_xvmin_h(max, out);
+  return out;
+}
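+
+/*
+ * Illustrative sketch (not part of the upstream header): clamping halfword
+ * lanes to a scalar [min_val, max_val] range by broadcasting the bounds
+ * first. The wrapper name is hypothetical.
+ */
+static inline __m256i clip_h_scalar_bounds_example(__m256i in, int min_val,
+                                                   int max_val) {
+  __m256i vmin = __lasx_xvreplgr2vr_h(min_val); /* splat min to all lanes */
+  __m256i vmax = __lasx_xvreplgr2vr_h(max_val); /* splat max to all lanes */
+  return __lasx_xvclip_h(in, vmin, vmax);
+}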
+
+/*
+ * =============================================================================
+ * Description : Clip all signed halfword elements of input vector
+ *               between 0 & 255
+ * Arguments   : Inputs  - in   (input vector)
+ *               Outputs - out  (output vector with clipped elements)
+ *               Return Type - signed halfword
+ * Example     : See out = __lasx_xvclip255_w(in)
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip255_h(__m256i in) {
+  __m256i out;
+
+  out = __lasx_xvmaxi_h(in, 0);
+  out = __lasx_xvsat_hu(out, 7);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Clip all signed word elements of input vector
+ *               between 0 & 255
+ * Arguments   : Inputs - in   (input vector)
+ *               Output - out  (output vector with clipped elements)
+ *               Return Type - signed word
+ * Example     : out = __lasx_xvclip255_w(in)
+ *          in : -8,255,280,249, -8,255,280,249
+ *         out :  0,255,255,249,  0,255,255,249
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvclip255_w(__m256i in) {
+  __m256i out;
+
+  out = __lasx_xvmaxi_w(in, 0);
+  out = __lasx_xvsat_wu(out, 7);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Indexed halfword element values are replicated to all
+ *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
+ *               if 'idx >= 8' use xvsplati_h_*.
+ * Arguments   : Inputs - in, idx
+ *               Output - out
+ * Details     : Idx element value from in vector is replicated to all
+ *               elements in out vector.
+ *               Valid index range for halfword operation is 0-7
+ * Example     : out = __lasx_xvsplati_l_h(in, idx)
+ *          in : 20,10,11,12, 13,14,15,16, 0,0,2,0, 0,0,0,0
+ *         idx : 0x02
+ *         out : 11,11,11,11, 11,11,11,11, 11,11,11,11, 11,11,11,11
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsplati_l_h(__m256i in, int idx) {
+  __m256i out;
+
+  out = __lasx_xvpermi_q(in, in, 0x02);
+  out = __lasx_xvreplve_h(out, idx);
+  return out;
+}
+
+/*
+ * =============================================================================
+ * Description : Indexed halfword element values are replicated to all
+ *               elements in output vector. If 'idx < 8' use xvsplati_l_*,
+ *               if 'idx >= 8' use xvsplati_h_*.
+ * Arguments   : Inputs - in, idx
+ *               Output - out
+ * Details     : Idx element value from in vector is replicated to all
+ *               elements in out vector.
+ *               Valid index range for this halfword operation is 8-15
+ * Example     : out = __lasx_xvsplati_h_h(in, idx)
+ *          in : 20,10,11,12, 13,14,15,16, 0,2,0,0, 0,0,0,0
+ *         idx : 0x09
+ *         out : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
+ * =============================================================================
+ */
+static inline __m256i __lasx_xvsplati_h_h(__m256i in, int idx) {
+  __m256i out;
+
+  out = __lasx_xvpermi_q(in, in, 0x13);
+  out = __lasx_xvreplve_h(out, idx);
+  return out;
+}
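+
+/*
+ * Illustrative sketch (not part of the upstream header): selecting the low-
+ * or high-half splat helper from a full 0-15 element index, per the note in
+ * the comments above. The wrapper name is hypothetical.
+ */
+static inline __m256i splati_any_h_example(__m256i in, int idx) {
+  return (idx < 8) ? __lasx_xvsplati_l_h(in, idx)
+                   : __lasx_xvsplati_h_h(in, idx);
+}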
+
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with double-word elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3
+ *               Outputs - _out0, _out1, _out2, _out3
+ * Example     : LASX_TRANSPOSE4x4_D
+ *        _in0 : 1,2,3,4
+ *        _in1 : 1,2,3,4
+ *        _in2 : 1,2,3,4
+ *        _in3 : 1,2,3,4
+ *
+ *       _out0 : 1,1,1,1
+ *       _out1 : 2,2,2,2
+ *       _out2 : 3,3,3,3
+ *       _out3 : 4,4,4,4
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE4x4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
+                            _out3)                                       \
+  {                                                                      \
+    __m256i _tmp0, _tmp1, _tmp2, _tmp3;                                  \
+    _tmp0 = __lasx_xvilvl_d(_in1, _in0);                                 \
+    _tmp1 = __lasx_xvilvh_d(_in1, _in0);                                 \
+    _tmp2 = __lasx_xvilvl_d(_in3, _in2);                                 \
+    _tmp3 = __lasx_xvilvh_d(_in3, _in2);                                 \
+    _out0 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x20);                        \
+    _out2 = __lasx_xvpermi_q(_tmp2, _tmp0, 0x31);                        \
+    _out1 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x20);                        \
+    _out3 = __lasx_xvpermi_q(_tmp3, _tmp1, 0x31);                        \
+  }
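+
+/*
+ * Illustrative sketch (not part of the upstream header): transposing four
+ * rows of doubleword elements in place through temporaries, since the macro
+ * outputs must be distinct lvalues. The wrapper name is hypothetical.
+ */
+static inline void transpose4x4_d_example(__m256i row[4]) {
+  __m256i c0, c1, c2, c3;
+
+  LASX_TRANSPOSE4x4_D(row[0], row[1], row[2], row[3], c0, c1, c2, c3);
+  row[0] = c0;
+  row[1] = c1;
+  row[2] = c2;
+  row[3] = c3;
+}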
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with word elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *               _out7
+ * Example     : LASX_TRANSPOSE8x8_W
+ *        _in0 : 1,2,3,4,5,6,7,8
+ *        _in1 : 2,2,3,4,5,6,7,8
+ *        _in2 : 3,2,3,4,5,6,7,8
+ *        _in3 : 4,2,3,4,5,6,7,8
+ *        _in4 : 5,2,3,4,5,6,7,8
+ *        _in5 : 6,2,3,4,5,6,7,8
+ *        _in6 : 7,2,3,4,5,6,7,8
+ *        _in7 : 8,2,3,4,5,6,7,8
+ *
+ *       _out0 : 1,2,3,4,5,6,7,8
+ *       _out1 : 2,2,2,2,2,2,2,2
+ *       _out2 : 3,3,3,3,3,3,3,3
+ *       _out3 : 4,4,4,4,4,4,4,4
+ *       _out4 : 5,5,5,5,5,5,5,5
+ *       _out5 : 6,6,6,6,6,6,6,6
+ *       _out6 : 7,7,7,7,7,7,7,7
+ *       _out7 : 8,8,8,8,8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                            _out7)                                           \
+  {                                                                          \
+    __m256i _s0_m, _s1_m;                                                    \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
+                                                                             \
+    _s0_m = __lasx_xvilvl_w(_in2, _in0);                                     \
+    _s1_m = __lasx_xvilvl_w(_in3, _in1);                                     \
+    _tmp0_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
+    _tmp1_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvh_w(_in2, _in0);                                     \
+    _s1_m = __lasx_xvilvh_w(_in3, _in1);                                     \
+    _tmp2_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
+    _tmp3_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvl_w(_in6, _in4);                                     \
+    _s1_m = __lasx_xvilvl_w(_in7, _in5);                                     \
+    _tmp4_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
+    _tmp5_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvh_w(_in6, _in4);                                     \
+    _s1_m = __lasx_xvilvh_w(_in7, _in5);                                     \
+    _tmp6_m = __lasx_xvilvl_w(_s1_m, _s0_m);                                 \
+    _tmp7_m = __lasx_xvilvh_w(_s1_m, _s0_m);                                 \
+    _out0 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x20);                        \
+    _out1 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x20);                        \
+    _out2 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x20);                        \
+    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x20);                        \
+    _out4 = __lasx_xvpermi_q(_tmp4_m, _tmp0_m, 0x31);                        \
+    _out5 = __lasx_xvpermi_q(_tmp5_m, _tmp1_m, 0x31);                        \
+    _out6 = __lasx_xvpermi_q(_tmp6_m, _tmp2_m, 0x31);                        \
+    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp3_m, 0x31);                        \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 16x8 byte block
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
+ *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
+ *                         (input 16x8 byte block)
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *                         _out7 (output 8x16 byte block)
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
+ * Example     : See LASX_TRANSPOSE16x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE16x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
+                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+                             _out6, _out7)                                    \
+  {                                                                           \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                               \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                               \
+                                                                              \
+    _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                                    \
+    _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                                    \
+    _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                                    \
+    _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                                    \
+    _tmp4_m = __lasx_xvilvl_b(_in10, _in8);                                   \
+    _tmp5_m = __lasx_xvilvl_b(_in11, _in9);                                   \
+    _tmp6_m = __lasx_xvilvl_b(_in14, _in12);                                  \
+    _tmp7_m = __lasx_xvilvl_b(_in15, _in13);                                  \
+    _out0 = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                                \
+    _out1 = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                                \
+    _out2 = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                                \
+    _out3 = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                                \
+    _out4 = __lasx_xvilvl_b(_tmp5_m, _tmp4_m);                                \
+    _out5 = __lasx_xvilvh_b(_tmp5_m, _tmp4_m);                                \
+    _out6 = __lasx_xvilvl_b(_tmp7_m, _tmp6_m);                                \
+    _out7 = __lasx_xvilvh_b(_tmp7_m, _tmp6_m);                                \
+    _tmp0_m = __lasx_xvilvl_w(_out2, _out0);                                  \
+    _tmp2_m = __lasx_xvilvh_w(_out2, _out0);                                  \
+    _tmp4_m = __lasx_xvilvl_w(_out3, _out1);                                  \
+    _tmp6_m = __lasx_xvilvh_w(_out3, _out1);                                  \
+    _tmp1_m = __lasx_xvilvl_w(_out6, _out4);                                  \
+    _tmp3_m = __lasx_xvilvh_w(_out6, _out4);                                  \
+    _tmp5_m = __lasx_xvilvl_w(_out7, _out5);                                  \
+    _tmp7_m = __lasx_xvilvh_w(_out7, _out5);                                  \
+    _out0 = __lasx_xvilvl_d(_tmp1_m, _tmp0_m);                                \
+    _out1 = __lasx_xvilvh_d(_tmp1_m, _tmp0_m);                                \
+    _out2 = __lasx_xvilvl_d(_tmp3_m, _tmp2_m);                                \
+    _out3 = __lasx_xvilvh_d(_tmp3_m, _tmp2_m);                                \
+    _out4 = __lasx_xvilvl_d(_tmp5_m, _tmp4_m);                                \
+    _out5 = __lasx_xvilvh_d(_tmp5_m, _tmp4_m);                                \
+    _out6 = __lasx_xvilvl_d(_tmp7_m, _tmp6_m);                                \
+    _out7 = __lasx_xvilvh_d(_tmp7_m, _tmp6_m);                                \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose input 16x8 halfword block
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,
+ *                         _in8, _in9, _in10, _in11, _in12, _in13, _in14, _in15
+ *                         (input 16x8 halfword block)
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *                         _out7 (output 8x16 halfword block)
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
+ * Example     : LASX_TRANSPOSE16x8_H
+ *        _in0 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in1 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in2 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in3 : 4,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in4 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in5 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in6 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in7 : 8,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in8 : 9,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *        _in9 : 1,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in10 : 0,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in11 : 2,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in12 : 3,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in13 : 7,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in14 : 5,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *       _in15 : 6,2,3,4,5,6,7,8,0,0,0,0,0,0,0,0
+ *
+ *       _out0 : 1,2,3,4,5,6,7,8,9,1,0,2,3,7,5,6
+ *       _out1 : 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2
+ *       _out2 : 3,3,3,3,3,3,3,3,3,3,3,3,3,3,3,3
+ *       _out3 : 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4
+ *       _out4 : 5,5,5,5,5,5,5,5,5,5,5,5,5,5,5,5
+ *       _out5 : 6,6,6,6,6,6,6,6,6,6,6,6,6,6,6,6
+ *       _out6 : 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7
+ *       _out7 : 8,8,8,8,8,8,8,8,8,8,8,8,8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE16x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                             _in8, _in9, _in10, _in11, _in12, _in13, _in14,   \
+                             _in15, _out0, _out1, _out2, _out3, _out4, _out5, \
+                             _out6, _out7)                                    \
+  {                                                                           \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                               \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                               \
+    __m256i _t0, _t1, _t2, _t3, _t4, _t5, _t6, _t7;                           \
+                                                                              \
+    _tmp0_m = __lasx_xvilvl_h(_in2, _in0);                                    \
+    _tmp1_m = __lasx_xvilvl_h(_in3, _in1);                                    \
+    _tmp2_m = __lasx_xvilvl_h(_in6, _in4);                                    \
+    _tmp3_m = __lasx_xvilvl_h(_in7, _in5);                                    \
+    _tmp4_m = __lasx_xvilvl_h(_in10, _in8);                                   \
+    _tmp5_m = __lasx_xvilvl_h(_in11, _in9);                                   \
+    _tmp6_m = __lasx_xvilvl_h(_in14, _in12);                                  \
+    _tmp7_m = __lasx_xvilvl_h(_in15, _in13);                                  \
+    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                  \
+    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                  \
+    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                  \
+    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                  \
+    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                  \
+    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                  \
+    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                  \
+    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                  \
+    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                      \
+    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                      \
+    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                      \
+    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                      \
+    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                      \
+    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                      \
+    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                      \
+    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                      \
+    _out0 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                         \
+    _out1 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                         \
+    _out2 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                         \
+    _out3 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                         \
+                                                                              \
+    _tmp0_m = __lasx_xvilvh_h(_in2, _in0);                                    \
+    _tmp1_m = __lasx_xvilvh_h(_in3, _in1);                                    \
+    _tmp2_m = __lasx_xvilvh_h(_in6, _in4);                                    \
+    _tmp3_m = __lasx_xvilvh_h(_in7, _in5);                                    \
+    _tmp4_m = __lasx_xvilvh_h(_in10, _in8);                                   \
+    _tmp5_m = __lasx_xvilvh_h(_in11, _in9);                                   \
+    _tmp6_m = __lasx_xvilvh_h(_in14, _in12);                                  \
+    _tmp7_m = __lasx_xvilvh_h(_in15, _in13);                                  \
+    _t0 = __lasx_xvilvl_h(_tmp1_m, _tmp0_m);                                  \
+    _t1 = __lasx_xvilvh_h(_tmp1_m, _tmp0_m);                                  \
+    _t2 = __lasx_xvilvl_h(_tmp3_m, _tmp2_m);                                  \
+    _t3 = __lasx_xvilvh_h(_tmp3_m, _tmp2_m);                                  \
+    _t4 = __lasx_xvilvl_h(_tmp5_m, _tmp4_m);                                  \
+    _t5 = __lasx_xvilvh_h(_tmp5_m, _tmp4_m);                                  \
+    _t6 = __lasx_xvilvl_h(_tmp7_m, _tmp6_m);                                  \
+    _t7 = __lasx_xvilvh_h(_tmp7_m, _tmp6_m);                                  \
+    _tmp0_m = __lasx_xvilvl_d(_t2, _t0);                                      \
+    _tmp2_m = __lasx_xvilvh_d(_t2, _t0);                                      \
+    _tmp4_m = __lasx_xvilvl_d(_t3, _t1);                                      \
+    _tmp6_m = __lasx_xvilvh_d(_t3, _t1);                                      \
+    _tmp1_m = __lasx_xvilvl_d(_t6, _t4);                                      \
+    _tmp3_m = __lasx_xvilvh_d(_t6, _t4);                                      \
+    _tmp5_m = __lasx_xvilvl_d(_t7, _t5);                                      \
+    _tmp7_m = __lasx_xvilvh_d(_t7, _t5);                                      \
+    _out4 = __lasx_xvpermi_q(_tmp1_m, _tmp0_m, 0x20);                         \
+    _out5 = __lasx_xvpermi_q(_tmp3_m, _tmp2_m, 0x20);                         \
+    _out6 = __lasx_xvpermi_q(_tmp5_m, _tmp4_m, 0x20);                         \
+    _out7 = __lasx_xvpermi_q(_tmp7_m, _tmp6_m, 0x20);                         \
+  }
+
+/*
+ * =============================================================================
+ * Description : Transpose 4x4 block with halfword elements in vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3
+ *               Outputs - _out0, _out1, _out2, _out3
+ *               Return Type - signed halfword
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
+ * Example     : See LASX_TRANSPOSE8x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE4x4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, \
+                            _out3)                                       \
+  {                                                                      \
+    __m256i _s0_m, _s1_m;                                                \
+                                                                         \
+    _s0_m = __lasx_xvilvl_h(_in1, _in0);                                 \
+    _s1_m = __lasx_xvilvl_h(_in3, _in2);                                 \
+    _out0 = __lasx_xvilvl_w(_s1_m, _s0_m);                               \
+    _out2 = __lasx_xvilvh_w(_s1_m, _s0_m);                               \
+    _out1 = __lasx_xvilvh_d(_out0, _out0);                               \
+    _out3 = __lasx_xvilvh_d(_out2, _out2);                               \
+  }
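+
+/*
+ * A worked instance of the macro above (first four halfword lanes of each
+ * vector shown; values are illustrative):
+ *   _in0 : 1, 2, 3, 4        _out0 : 1, 5, 9, 13
+ *   _in1 : 5, 6, 7, 8        _out1 : 2, 6, 10, 14
+ *   _in2 : 9, 10, 11, 12     _out2 : 3, 7, 11, 15
+ *   _in3 : 13, 14, 15, 16    _out3 : 4, 8, 12, 16
+ */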
+
+/*
+ * =============================================================================
+ * Description : Transpose input 8x8 byte block
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7
+ *                         (input 8x8 byte block)
+ *               Outputs - _out0, _out1, _out2, _out3, _out4, _out5, _out6,
+ *                         _out7 (output 8x8 byte block)
+ * Example     : See LASX_TRANSPOSE8x8_H
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                            _out7)                                           \
+  {                                                                          \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
+    _tmp0_m = __lasx_xvilvl_b(_in2, _in0);                                   \
+    _tmp1_m = __lasx_xvilvl_b(_in3, _in1);                                   \
+    _tmp2_m = __lasx_xvilvl_b(_in6, _in4);                                   \
+    _tmp3_m = __lasx_xvilvl_b(_in7, _in5);                                   \
+    _tmp4_m = __lasx_xvilvl_b(_tmp1_m, _tmp0_m);                             \
+    _tmp5_m = __lasx_xvilvh_b(_tmp1_m, _tmp0_m);                             \
+    _tmp6_m = __lasx_xvilvl_b(_tmp3_m, _tmp2_m);                             \
+    _tmp7_m = __lasx_xvilvh_b(_tmp3_m, _tmp2_m);                             \
+    _out0 = __lasx_xvilvl_w(_tmp6_m, _tmp4_m);                               \
+    _out2 = __lasx_xvilvh_w(_tmp6_m, _tmp4_m);                               \
+    _out4 = __lasx_xvilvl_w(_tmp7_m, _tmp5_m);                               \
+    _out6 = __lasx_xvilvh_w(_tmp7_m, _tmp5_m);                               \
+    _out1 = __lasx_xvbsrl_v(_out0, 8);                                       \
+    _out3 = __lasx_xvbsrl_v(_out2, 8);                                       \
+    _out5 = __lasx_xvbsrl_v(_out4, 8);                                       \
+    _out7 = __lasx_xvbsrl_v(_out6, 8);                                       \
+  }
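+
+/*
+ * Note on the macro above: __lasx_xvbsrl_v shifts each 128-bit lane
+ * independently, so every output carries its transposed row in the low
+ * 8 bytes of each lane; only those bytes are significant to callers.
+ */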
+
+/*
+ * =============================================================================
+ * Description : Transpose 8x8 block with halfword elements in vectors.
+ * Arguments   : Inputs  - _in0, _in1, ~
+ *               Outputs - _out0, _out1, ~
+ * Details     : The rows of the matrix become columns, and the columns become
+ *               rows.
+ * Example     : LASX_TRANSPOSE8x8_H
+ *        _in0 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        _in1 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
+ *        _in2 : 8,2,3,4, 5,6,7,8, 8,2,3,4, 5,6,7,8
+ *        _in3 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        _in4 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
+ *        _in5 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        _in6 : 1,2,3,4, 5,6,7,8, 1,2,3,4, 5,6,7,8
+ *        _in7 : 9,2,3,4, 5,6,7,8, 9,2,3,4, 5,6,7,8
+ *
+ *       _out0 : 1,8,8,1, 9,1,1,9, 1,8,8,1, 9,1,1,9
+ *       _out1 : 2,2,2,2, 2,2,2,2, 2,2,2,2, 2,2,2,2
+ *       _out2 : 3,3,3,3, 3,3,3,3, 3,3,3,3, 3,3,3,3
+ *       _out3 : 4,4,4,4, 4,4,4,4, 4,4,4,4, 4,4,4,4
+ *       _out4 : 5,5,5,5, 5,5,5,5, 5,5,5,5, 5,5,5,5
+ *       _out5 : 6,6,6,6, 6,6,6,6, 6,6,6,6, 6,6,6,6
+ *       _out6 : 7,7,7,7, 7,7,7,7, 7,7,7,7, 7,7,7,7
+ *       _out7 : 8,8,8,8, 8,8,8,8, 8,8,8,8, 8,8,8,8
+ * =============================================================================
+ */
+#define LASX_TRANSPOSE8x8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                            _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                            _out7)                                           \
+  {                                                                          \
+    __m256i _s0_m, _s1_m;                                                    \
+    __m256i _tmp0_m, _tmp1_m, _tmp2_m, _tmp3_m;                              \
+    __m256i _tmp4_m, _tmp5_m, _tmp6_m, _tmp7_m;                              \
+                                                                             \
+    _s0_m = __lasx_xvilvl_h(_in6, _in4);                                     \
+    _s1_m = __lasx_xvilvl_h(_in7, _in5);                                     \
+    _tmp0_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
+    _tmp1_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvh_h(_in6, _in4);                                     \
+    _s1_m = __lasx_xvilvh_h(_in7, _in5);                                     \
+    _tmp2_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
+    _tmp3_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
+                                                                             \
+    _s0_m = __lasx_xvilvl_h(_in2, _in0);                                     \
+    _s1_m = __lasx_xvilvl_h(_in3, _in1);                                     \
+    _tmp4_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
+    _tmp5_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
+    _s0_m = __lasx_xvilvh_h(_in2, _in0);                                     \
+    _s1_m = __lasx_xvilvh_h(_in3, _in1);                                     \
+    _tmp6_m = __lasx_xvilvl_h(_s1_m, _s0_m);                                 \
+    _tmp7_m = __lasx_xvilvh_h(_s1_m, _s0_m);                                 \
+                                                                             \
+    _out0 = __lasx_xvpickev_d(_tmp0_m, _tmp4_m);                             \
+    _out2 = __lasx_xvpickev_d(_tmp1_m, _tmp5_m);                             \
+    _out4 = __lasx_xvpickev_d(_tmp2_m, _tmp6_m);                             \
+    _out6 = __lasx_xvpickev_d(_tmp3_m, _tmp7_m);                             \
+    _out1 = __lasx_xvpickod_d(_tmp0_m, _tmp4_m);                             \
+    _out3 = __lasx_xvpickod_d(_tmp1_m, _tmp5_m);                             \
+    _out5 = __lasx_xvpickod_d(_tmp2_m, _tmp6_m);                             \
+    _out7 = __lasx_xvpickod_d(_tmp3_m, _tmp7_m);                             \
+  }
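+
+/*
+ * Usage sketch for the macro above (assumes a LoongArch toolchain providing
+ * <lasxintrin.h>; 'src' and 'stride' are illustrative names, not part of
+ * this header):
+ *
+ *   int16_t *src;   // 8 rows of halfwords, 'stride' halfwords apart
+ *   __m256i r0 = __lasx_xvld(src + 0 * stride, 0);
+ *   __m256i r1 = __lasx_xvld(src + 1 * stride, 0);
+ *   __m256i r2 = __lasx_xvld(src + 2 * stride, 0);
+ *   __m256i r3 = __lasx_xvld(src + 3 * stride, 0);
+ *   __m256i r4 = __lasx_xvld(src + 4 * stride, 0);
+ *   __m256i r5 = __lasx_xvld(src + 5 * stride, 0);
+ *   __m256i r6 = __lasx_xvld(src + 6 * stride, 0);
+ *   __m256i r7 = __lasx_xvld(src + 7 * stride, 0);
+ *   __m256i c0, c1, c2, c3, c4, c5, c6, c7;
+ *   LASX_TRANSPOSE8x8_H(r0, r1, r2, r3, r4, r5, r6, r7,
+ *                       c0, c1, c2, c3, c4, c5, c6, c7);
+ */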
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 4 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3
+ *               Outputs - _out0, _out1, _out2, _out3
+ * Details     : Butterfly operation: sums of input pairs mirrored about the
+ *               center, followed by the matching differences.
+ * Example     : LASX_BUTTERFLY_4
+ *               _out0 = _in0 + _in3;
+ *               _out1 = _in1 + _in2;
+ *               _out2 = _in1 - _in2;
+ *               _out3 = _in0 - _in3;
+ * =============================================================================
+ */
+#define LASX_BUTTERFLY_4_B(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    _out0 = __lasx_xvadd_b(_in0, _in3);                                        \
+    _out1 = __lasx_xvadd_b(_in1, _in2);                                        \
+    _out2 = __lasx_xvsub_b(_in1, _in2);                                        \
+    _out3 = __lasx_xvsub_b(_in0, _in3);                                        \
+  }
+#define LASX_BUTTERFLY_4_H(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    _out0 = __lasx_xvadd_h(_in0, _in3);                                        \
+    _out1 = __lasx_xvadd_h(_in1, _in2);                                        \
+    _out2 = __lasx_xvsub_h(_in1, _in2);                                        \
+    _out3 = __lasx_xvsub_h(_in0, _in3);                                        \
+  }
+#define LASX_BUTTERFLY_4_W(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    _out0 = __lasx_xvadd_w(_in0, _in3);                                        \
+    _out1 = __lasx_xvadd_w(_in1, _in2);                                        \
+    _out2 = __lasx_xvsub_w(_in1, _in2);                                        \
+    _out3 = __lasx_xvsub_w(_in0, _in3);                                        \
+  }
+#define LASX_BUTTERFLY_4_D(_in0, _in1, _in2, _in3, _out0, _out1, _out2, _out3) \
+  {                                                                            \
+    _out0 = __lasx_xvadd_d(_in0, _in3);                                        \
+    _out1 = __lasx_xvadd_d(_in1, _in2);                                        \
+    _out2 = __lasx_xvsub_d(_in1, _in2);                                        \
+    _out3 = __lasx_xvsub_d(_in0, _in3);                                        \
+  }
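+
+/*
+ * A worked instance of LASX_BUTTERFLY_4_H (per halfword lane; values are
+ * illustrative):
+ *   _in0 = 1, _in1 = 2, _in2 = 3, _in3 = 4
+ *   _out0 = 1 + 4 =  5
+ *   _out1 = 2 + 3 =  5
+ *   _out2 = 2 - 3 = -1
+ *   _out3 = 1 - 4 = -3
+ */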
+
+/*
+ * =============================================================================
+ * Description : Butterfly of 8 input vectors
+ * Arguments   : Inputs  - _in0, _in1, _in2, _in3, ~
+ *               Outputs - _out0, _out1, _out2, _out3, ~
+ * Details     : Butterfly operation: sums of input pairs mirrored about the
+ *               center, followed by the matching differences.
+ * Example     : LASX_BUTTERFLY_8
+ *               _out0 = _in0 + _in7;
+ *               _out1 = _in1 + _in6;
+ *               _out2 = _in2 + _in5;
+ *               _out3 = _in3 + _in4;
+ *               _out4 = _in3 - _in4;
+ *               _out5 = _in2 - _in5;
+ *               _out6 = _in1 - _in6;
+ *               _out7 = _in0 - _in7;
+ * =============================================================================
+ */
+#define LASX_BUTTERFLY_8_B(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    _out0 = __lasx_xvadd_b(_in0, _in7);                                     \
+    _out1 = __lasx_xvadd_b(_in1, _in6);                                     \
+    _out2 = __lasx_xvadd_b(_in2, _in5);                                     \
+    _out3 = __lasx_xvadd_b(_in3, _in4);                                     \
+    _out4 = __lasx_xvsub_b(_in3, _in4);                                     \
+    _out5 = __lasx_xvsub_b(_in2, _in5);                                     \
+    _out6 = __lasx_xvsub_b(_in1, _in6);                                     \
+    _out7 = __lasx_xvsub_b(_in0, _in7);                                     \
+  }
+
+#define LASX_BUTTERFLY_8_H(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    _out0 = __lasx_xvadd_h(_in0, _in7);                                     \
+    _out1 = __lasx_xvadd_h(_in1, _in6);                                     \
+    _out2 = __lasx_xvadd_h(_in2, _in5);                                     \
+    _out3 = __lasx_xvadd_h(_in3, _in4);                                     \
+    _out4 = __lasx_xvsub_h(_in3, _in4);                                     \
+    _out5 = __lasx_xvsub_h(_in2, _in5);                                     \
+    _out6 = __lasx_xvsub_h(_in1, _in6);                                     \
+    _out7 = __lasx_xvsub_h(_in0, _in7);                                     \
+  }
+
+#define LASX_BUTTERFLY_8_W(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    _out0 = __lasx_xvadd_w(_in0, _in7);                                     \
+    _out1 = __lasx_xvadd_w(_in1, _in6);                                     \
+    _out2 = __lasx_xvadd_w(_in2, _in5);                                     \
+    _out3 = __lasx_xvadd_w(_in3, _in4);                                     \
+    _out4 = __lasx_xvsub_w(_in3, _in4);                                     \
+    _out5 = __lasx_xvsub_w(_in2, _in5);                                     \
+    _out6 = __lasx_xvsub_w(_in1, _in6);                                     \
+    _out7 = __lasx_xvsub_w(_in0, _in7);                                     \
+  }
+
+#define LASX_BUTTERFLY_8_D(_in0, _in1, _in2, _in3, _in4, _in5, _in6, _in7,  \
+                           _out0, _out1, _out2, _out3, _out4, _out5, _out6, \
+                           _out7)                                           \
+  {                                                                         \
+    _out0 = __lasx_xvadd_d(_in0, _in7);                                     \
+    _out1 = __lasx_xvadd_d(_in1, _in6);                                     \
+    _out2 = __lasx_xvadd_d(_in2, _in5);                                     \
+    _out3 = __lasx_xvadd_d(_in3, _in4);                                     \
+    _out4 = __lasx_xvsub_d(_in3, _in4);                                     \
+    _out5 = __lasx_xvsub_d(_in2, _in5);                                     \
+    _out6 = __lasx_xvsub_d(_in1, _in6);                                     \
+    _out7 = __lasx_xvsub_d(_in0, _in7);                                     \
+  }
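+
+/*
+ * A worked instance of LASX_BUTTERFLY_8_H (per halfword lane; values are
+ * illustrative):
+ *   _in0 .. _in7   = 1, 2, 3, 4, 5, 6, 7, 8
+ *   _out0 .. _out3 =  9,  9,  9,  9   (mirrored sums: 1+8, 2+7, 3+6, 4+5)
+ *   _out4 .. _out7 = -1, -3, -5, -7   (differences: 4-5, 3-6, 2-7, 1-8)
+ */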
+
+#endif  // LASX
+
+/*
+ * =============================================================================
+ * Description : Print out elements in vector.
+ * Arguments   : Inputs  - RTYPE, element_num, in0, enter
+ *               Outputs -
+ * Details     : Print out 'element_num' elements in 'RTYPE' vector 'in0'. If
+ *               'enter' is TRUE, the prefix "\nVP:" is printed first.
+ * Example     : VECT_PRINT(v4i32,4,in0,1); // in0: 1,2,3,4
+ *               VP:1,2,3,4,
+ * =============================================================================
+ */
+#define VECT_PRINT(RTYPE, element_num, in0, enter)                 \
+  {                                                                \
+    RTYPE _tmp0 = (RTYPE)in0;                                      \
+    int _i = 0;                                                    \
+    if (enter) printf("\nVP:");                                    \
+    for (_i = 0; _i < element_num; _i++) printf("%d,", _tmp0[_i]); \
+  }
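+
+/*
+ * Debug-only usage sketch, assuming LASX is enabled and the v16i16 vector
+ * typedef is provided by <lasxintrin.h> ('a', 'b' and 'sum' are
+ * illustrative):
+ *
+ *   __m256i sum = __lasx_xvadd_h(a, b);
+ *   VECT_PRINT(v16i16, 16, sum, 1);  // prints all 16 halfword lanes
+ */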
+
+#endif /* LOONGSON_INTRINSICS_H */
+#endif /* VPX_VPX_UTIL_LOONGSON_INTRINSICS_H_ */
diff --git a/libvpx/vpxdec.c b/libvpx/vpxdec.c
index ad368a2..84cef7d 100644
--- a/libvpx/vpxdec.c
+++ b/libvpx/vpxdec.c
@@ -581,7 +581,10 @@
   /* Parse command line */
   exec_name = argv_[0];
   argv = argv_dup(argc - 1, argv_ + 1);
-
+  if (!argv) {
+    fprintf(stderr, "Error allocating argument list\n");
+    return EXIT_FAILURE;
+  }
   for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
     memset(&arg, 0, sizeof(arg));
     arg.argv_step = 1;
@@ -815,6 +818,10 @@
     ext_fb_list.num_external_frame_buffers = num_external_frame_buffers;
     ext_fb_list.ext_fb = (struct ExternalFrameBuffer *)calloc(
         num_external_frame_buffers, sizeof(*ext_fb_list.ext_fb));
+    if (!ext_fb_list.ext_fb) {
+      fprintf(stderr, "Failed to allocate ExternalFrameBuffer\n");
+      goto fail;
+    }
     if (vpx_codec_set_frame_buffer_functions(&decoder, get_vp9_frame_buffer,
                                              release_vp9_frame_buffer,
                                              &ext_fb_list)) {
@@ -930,6 +937,11 @@
           }
           scaled_img =
               vpx_img_alloc(NULL, img->fmt, render_width, render_height, 16);
+          if (!scaled_img) {
+            fprintf(stderr, "Failed to allocate scaled image (%d x %d)\n",
+                    render_width, render_height);
+            goto fail;
+          }
           scaled_img->bit_depth = img->bit_depth;
         }
 
@@ -966,6 +978,10 @@
         if (!img_shifted) {
           img_shifted =
               vpx_img_alloc(NULL, shifted_fmt, img->d_w, img->d_h, 16);
+          if (!img_shifted) {
+            fprintf(stderr, "Failed to allocate image\n");
+            goto fail;
+          }
           img_shifted->bit_depth = output_bit_depth;
         }
         if (output_bit_depth > img->bit_depth) {
@@ -1110,6 +1126,10 @@
   int error = 0;
 
   argv = argv_dup(argc - 1, argv_ + 1);
+  if (!argv) {
+    fprintf(stderr, "Error allocating argument list\n");
+    return EXIT_FAILURE;
+  }
   for (argi = argj = argv; (*argj = *argi); argi += arg.argv_step) {
     memset(&arg, 0, sizeof(arg));
     arg.argv_step = 1;
diff --git a/libvpx/vpxenc.c b/libvpx/vpxenc.c
index a0122ef..7eff97b 100644
--- a/libvpx/vpxenc.c
+++ b/libvpx/vpxenc.c
@@ -58,8 +58,8 @@
 
 static const char *exec_name;
 
-static void warn_or_exit_on_errorv(vpx_codec_ctx_t *ctx, int fatal,
-                                   const char *s, va_list ap) {
+static VPX_TOOLS_FORMAT_PRINTF(3, 0) void warn_or_exit_on_errorv(
+    vpx_codec_ctx_t *ctx, int fatal, const char *s, va_list ap) {
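+  /* VPX_TOOLS_FORMAT_PRINTF(m, n) wraps the compiler's printf-format
+   * attribute: argument m is the format string and n is the first variadic
+   * argument to check, with n == 0 used for va_list-style functions. */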
   if (ctx->err) {
     const char *detail = vpx_codec_error_detail(ctx);
 
@@ -72,7 +72,9 @@
   }
 }
 
-static void ctx_exit_on_error(vpx_codec_ctx_t *ctx, const char *s, ...) {
+static VPX_TOOLS_FORMAT_PRINTF(2,
+                               3) void ctx_exit_on_error(vpx_codec_ctx_t *ctx,
+                                                         const char *s, ...) {
   va_list ap;
 
   va_start(ap, s);
@@ -80,8 +82,8 @@
   va_end(ap);
 }
 
-static void warn_or_exit_on_error(vpx_codec_ctx_t *ctx, int fatal,
-                                  const char *s, ...) {
+static VPX_TOOLS_FORMAT_PRINTF(3, 4) void warn_or_exit_on_error(
+    vpx_codec_ctx_t *ctx, int fatal, const char *s, ...) {
   va_list ap;
 
   va_start(ap, s);
@@ -1701,6 +1703,10 @@
    * codec.
    */
   argv = argv_dup(argc - 1, argv_ + 1);
+  if (!argv) {
+    fprintf(stderr, "Error allocating argument list\n");
+    return EXIT_FAILURE;
+  }
   parse_global_config(&global, argv);
 
   if (argc < 3) usage_exit();
diff --git a/libvpx/vpxstats.c b/libvpx/vpxstats.c
index 142e367..c0dd14e 100644
--- a/libvpx/vpxstats.c
+++ b/libvpx/vpxstats.c
@@ -41,7 +41,7 @@
     stats->buf.buf = malloc(stats->buf_alloc_sz);
 
     if (!stats->buf.buf)
-      fatal("Failed to allocate first-pass stats buffer (%lu bytes)",
+      fatal("Failed to allocate first-pass stats buffer (%u bytes)",
             (unsigned int)stats->buf_alloc_sz);
 
     nbytes = fread(stats->buf.buf, 1, stats->buf.sz, stats->file);
diff --git a/libvpx/warnings.c b/libvpx/warnings.c
index a80da52..3e6e702 100644
--- a/libvpx/warnings.c
+++ b/libvpx/warnings.c
@@ -98,7 +98,7 @@
   /* Count and print warnings. */
   for (warning = warning_list.warning_node; warning != NULL;
        warning = warning->next_warning, ++num_warnings) {
-    warn(warning->warning_string);
+    warn("%s", warning->warning_string);
   }
 
   free_warning_list(&warning_list);
diff --git a/libvpx/webmdec.cc b/libvpx/webmdec.cc
index 68c6f47..f7671bb 100644
--- a/libvpx/webmdec.cc
+++ b/libvpx/webmdec.cc
@@ -210,6 +210,8 @@
   vpx_ctx->framerate.denominator =
       static_cast<int>(webm_ctx->timestamp_ns / 1000);
   delete[] buffer;
+  // webm_ctx->buffer is assigned to the buffer pointer in webm_read_frame(),
+  // so clear it here to keep webm_free() from freeing the dangling pointer.
+  webm_ctx->buffer = nullptr;
 
   get_first_cluster(webm_ctx);
   webm_ctx->block = nullptr;
diff --git a/libvpx/y4minput.c b/libvpx/y4minput.c
index 9a4bdbd..7d3c03a 100644
--- a/libvpx/y4minput.c
+++ b/libvpx/y4minput.c
@@ -1087,9 +1087,15 @@
     y4m_ctx->dst_buf = (unsigned char *)malloc(y4m_ctx->dst_buf_sz);
   else
     y4m_ctx->dst_buf = (unsigned char *)malloc(2 * y4m_ctx->dst_buf_sz);
+  if (!y4m_ctx->dst_buf) return -1;
 
-  if (y4m_ctx->aux_buf_sz > 0)
+  if (y4m_ctx->aux_buf_sz > 0) {
     y4m_ctx->aux_buf = (unsigned char *)malloc(y4m_ctx->aux_buf_sz);
+    if (!y4m_ctx->aux_buf) {
+      free(y4m_ctx->dst_buf);
+      return -1;
+    }
+  }
   return 0;
 }