Import SIMD intrinsics from more recent Thor code

This enables ARMv8/aarch64 optimisations of CDEF, along with a few
minor improvements to x86 and ARMv7.  Several new intrinsics have also
been added, which makes it possible to remove x86-specific code from
the CDEF implementation.  Various sanitizer warnings have been
addressed (mostly related to intentional two's-complement
overflow/underflow), and there are several AVX2 improvements.
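
To illustrate the sanitizer fix: the C reference code now performs
wrapping 64-bit additions explicitly instead of relying on unsigned
overflow, which keeps the unsigned-integer-overflow sanitizer quiet.
A simplified, standalone sketch of the pattern used in c_v128_add_64
below (the helper name wrap_add_u64 is illustrative, not part of the
API):

  #include <stdint.h>

  /* Equivalent to (a + b) mod 2^64, but the wrapping case is computed
     as a - ~b - 1, so neither branch overflows or underflows. */
  static uint64_t wrap_add_u64(uint64_t a, uint64_t b) {
    return a > ~b ? a - ~b - 1 : a + b;
  }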

New intrinsics: v64_sadd_s8, v64_sadd_u8, v64_pack_s32_u16,
v64_rdavg_u16, v128_sad_u16, v128_ssd_s16, v128_sadd_s8, v128_sadd_u8,
v128_add_64, v128_sub_64, v128_pack_s32_u16, v128_rdavg_u16,
v128_min_s32, v128_max_s32, v128_cmpgt_s32, v128_cmpeq_32,
v128_cmplt_s32, v128_padd_u8, v128_shl_n_64, v128_shr_n_u64,
v128_shr_n_s64, v128_shr_s64, v128_shr_u64, v128_shl_64,
v128_dotp_su8, v128_dotp_s32, v128_movemask_8, v128_dup_64,
v128_blend_8, v256_sad_u16, v256_ssd_s16, v256_low_u64, v256_dotp_su8,
v256_dotp_s32, v256_sadd_s8, v256_sadd_u8, v256_add_64, v256_sub_64,
v256_pack_s32_u16, v256_rdavg_u16, v256_min_s32, v256_max_s32,
v256_cmpgt_s32, v256_cmplt_s32, v256_cmpeq_32, v256_wideshuffle_8,
v256_padd_u8, v256_shl_n_64, v256_shr_n_u64, v256_shr_n_s64,
v256_shr_s64, v256_shr_u64, v256_shl_64, v256_movemask_8, v256_dup_64,
v256_blend_8, v256_unziplo_64, v256_unziphi_64
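
As a usage illustration of the new accumulator-style intrinsics (a
hypothetical sketch, not code taken from CDEF; it assumes n is a
multiple of 8 and uses the existing v128_load_unaligned() helper):

  /* Sum of absolute differences over n 16-bit values, 8 lanes per v128.
     The accumulator is initialised, updated with v128_sad_u16() and
     finalised with v128_sad_u16_sum(), mirroring the u8 variants. */
  static uint32_t block_sad_u16(const uint16_t *a, const uint16_t *b, int n) {
    sad128_internal_u16 s = v128_sad_u16_init();
    int i;
    for (i = 0; i < n; i += 8)
      s = v128_sad_u16(s, v128_load_unaligned(a + i),
                       v128_load_unaligned(b + i));
    return v128_sad_u16_sum(s);
  }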

The unit tests have been updated.

Change-Id: If051e902f2095e3a02aaf13cf1230475392f051e
diff --git a/aom_dsp/simd/v128_intrinsics.h b/aom_dsp/simd/v128_intrinsics.h
index d7910ec..51a38a7 100644
--- a/aom_dsp/simd/v128_intrinsics.h
+++ b/aom_dsp/simd/v128_intrinsics.h
@@ -58,6 +58,7 @@
 SIMD_INLINE v128 v128_dup_8(uint8_t x) { return c_v128_dup_8(x); }
 SIMD_INLINE v128 v128_dup_16(uint16_t x) { return c_v128_dup_16(x); }
 SIMD_INLINE v128 v128_dup_32(uint32_t x) { return c_v128_dup_32(x); }
+SIMD_INLINE v128 v128_dup_64(uint64_t x) { return c_v128_dup_64(x); }
 
 typedef uint32_t sad128_internal;
 SIMD_INLINE sad128_internal v128_sad_u8_init() { return c_v128_sad_u8_init(); }
@@ -75,9 +76,15 @@
 SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
   return c_v128_ssd_u8_sum(s);
 }
+SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
+  return c_v128_dotp_su8(a, b);
+}
 SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
   return c_v128_dotp_s16(a, b);
 }
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
+  return c_v128_dotp_s32(a, b);
+}
 SIMD_INLINE uint64_t v128_hadd_u8(v128 a) { return c_v128_hadd_u8(a); }
 
 SIMD_INLINE v128 v128_or(v128 a, v128 b) { return c_v128_or(a, b); }
@@ -87,8 +94,12 @@
 
 SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return c_v128_add_8(a, b); }
 SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return c_v128_add_16(a, b); }
+SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return c_v128_sadd_u8(a, b); }
+SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return c_v128_sadd_s8(a, b); }
 SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return c_v128_sadd_s16(a, b); }
 SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return c_v128_add_32(a, b); }
+SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return c_v128_add_64(a, b); }
+SIMD_INLINE v128 v128_padd_u8(v128 a) { return c_v128_padd_u8(a); }
 SIMD_INLINE v128 v128_padd_s16(v128 a) { return c_v128_padd_s16(a); }
 SIMD_INLINE v128 v128_sub_8(v128 a, v128 b) { return c_v128_sub_8(a, b); }
 SIMD_INLINE v128 v128_ssub_u8(v128 a, v128 b) { return c_v128_ssub_u8(a, b); }
@@ -97,6 +108,7 @@
 SIMD_INLINE v128 v128_ssub_s16(v128 a, v128 b) { return c_v128_ssub_s16(a, b); }
 SIMD_INLINE v128 v128_ssub_u16(v128 a, v128 b) { return c_v128_ssub_u16(a, b); }
 SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return c_v128_sub_32(a, b); }
+SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return c_v128_sub_64(a, b); }
 SIMD_INLINE v128 v128_abs_s16(v128 a) { return c_v128_abs_s16(a); }
 SIMD_INLINE v128 v128_abs_s8(v128 a) { return c_v128_abs_s8(a); }
 
@@ -113,8 +125,16 @@
 SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return c_v128_madd_s16(a, b); }
 SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) { return c_v128_madd_us8(a, b); }
 
+SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return c_v128_movemask_8(a); }
+SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
+  return c_v128_blend_8(a, b, c);
+}
+
 SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return c_v128_avg_u8(a, b); }
 SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) { return c_v128_rdavg_u8(a, b); }
+SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) {
+  return c_v128_rdavg_u16(a, b);
+}
 SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return c_v128_avg_u16(a, b); }
 SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return c_v128_min_u8(a, b); }
 SIMD_INLINE v128 v128_max_u8(v128 a, v128 b) { return c_v128_max_u8(a, b); }
@@ -122,6 +142,8 @@
 SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) { return c_v128_max_s8(a, b); }
 SIMD_INLINE v128 v128_min_s16(v128 a, v128 b) { return c_v128_min_s16(a, b); }
 SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return c_v128_max_s16(a, b); }
+SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) { return c_v128_min_s32(a, b); }
+SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) { return c_v128_max_s32(a, b); }
 
 SIMD_INLINE v128 v128_ziplo_8(v128 a, v128 b) { return c_v128_ziplo_8(a, b); }
 SIMD_INLINE v128 v128_ziphi_8(v128 a, v128 b) { return c_v128_ziphi_8(a, b); }
@@ -169,6 +191,9 @@
 SIMD_INLINE v128 v128_pack_s32_s16(v128 a, v128 b) {
   return c_v128_pack_s32_s16(a, b);
 }
+SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
+  return c_v128_pack_s32_u16(a, b);
+}
 SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
   return c_v128_pack_s16_u8(a, b);
 }
@@ -204,6 +229,14 @@
 }
 SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return c_v128_cmpeq_16(a, b); }
 
+SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) {
+  return c_v128_cmpgt_s32(a, b);
+}
+SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) {
+  return c_v128_cmplt_s32(a, b);
+}
+SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return c_v128_cmpeq_32(a, b); }
+
 SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
   return c_v128_shl_8(a, c);
 }
@@ -231,6 +264,15 @@
 SIMD_INLINE v128 v128_shr_s32(v128 a, unsigned int c) {
   return c_v128_shr_s32(a, c);
 }
+SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
+  return c_v128_shl_64(a, c);
+}
+SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
+  return c_v128_shr_u64(a, c);
+}
+SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
+  return c_v128_shr_s64(a, c);
+}
 
 SIMD_INLINE v128 v128_shr_n_byte(v128 a, unsigned int n) {
   return c_v128_shr_n_byte(a, n);
@@ -247,6 +289,9 @@
 SIMD_INLINE v128 v128_shl_n_32(v128 a, unsigned int n) {
   return c_v128_shl_n_32(a, n);
 }
+SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int n) {
+  return c_v128_shl_n_64(a, n);
+}
 SIMD_INLINE v128 v128_shr_n_u8(v128 a, unsigned int n) {
   return c_v128_shr_n_u8(a, n);
 }
@@ -256,6 +301,9 @@
 SIMD_INLINE v128 v128_shr_n_u32(v128 a, unsigned int n) {
   return c_v128_shr_n_u32(a, n);
 }
+SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int n) {
+  return c_v128_shr_n_u64(a, n);
+}
 SIMD_INLINE v128 v128_shr_n_s8(v128 a, unsigned int n) {
   return c_v128_shr_n_s8(a, n);
 }
@@ -265,5 +313,32 @@
 SIMD_INLINE v128 v128_shr_n_s32(v128 a, unsigned int n) {
   return c_v128_shr_n_s32(a, n);
 }
+SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int n) {
+  return c_v128_shr_n_s64(a, n);
+}
+
+typedef uint32_t sad128_internal_u16;
+SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() {
+  return c_v128_sad_u16_init();
+}
+SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
+                                             v128 b) {
+  return c_v128_sad_u16(s, a, b);
+}
+SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
+  return c_v128_sad_u16_sum(s);
+}
+
+typedef uint64_t ssd128_internal_s16;
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() {
+  return c_v128_ssd_s16_init();
+}
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
+                                             v128 b) {
+  return c_v128_ssd_s16(s, a, b);
+}
+SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
+  return c_v128_ssd_s16_sum(s);
+}
 
 #endif /* _V128_INTRINSICS_H */
diff --git a/aom_dsp/simd/v128_intrinsics_arm.h b/aom_dsp/simd/v128_intrinsics_arm.h
index ceb253a..d4fec42 100644
--- a/aom_dsp/simd/v128_intrinsics_arm.h
+++ b/aom_dsp/simd/v128_intrinsics_arm.h
@@ -84,21 +84,54 @@
   return vreinterpretq_s64_u32(vdupq_n_u32(x));
 }
 
+SIMD_INLINE v128 v128_dup_64(uint64_t x) {
+  return vreinterpretq_s64_u64(vdupq_n_u64(x));
+}
+
+SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
+  int16x8_t t1 = vmulq_s16(
+      vmovl_s8(vreinterpret_s8_s64(vget_low_s64(a))),
+      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(b)))));
+  int16x8_t t2 = vmulq_s16(
+      vmovl_s8(vreinterpret_s8_s64(vget_high_s64(a))),
+      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(b)))));
+#if defined(__aarch64__)
+  return vaddlvq_s16(t1) + vaddlvq_s16(t2);
+#else
+  int64x2_t t = vpaddlq_s32(vaddq_s32(vpaddlq_s16(t1), vpaddlq_s16(t2)));
+  return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t);
+#endif
+}
+
 SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
   return v64_dotp_s16(vget_high_s64(a), vget_high_s64(b)) +
          v64_dotp_s16(vget_low_s64(a), vget_low_s64(b));
 }
 
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
+  int64x2_t t = vpaddlq_s32(
+      vmulq_s32(vreinterpretq_s32_s64(a), vreinterpretq_s32_s64(b)));
+  return (int64_t)vget_high_s64(t) + (int64_t)vget_low_s64(t);
+}
+
 SIMD_INLINE uint64_t v128_hadd_u8(v128 x) {
+#if defined(__aarch64__)
+  return vaddlvq_u8(vreinterpretq_u8_s64(x));
+#else
   uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(vreinterpretq_u8_s64(x))));
   return vget_lane_s32(
       vreinterpret_s32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0);
+#endif
 }
 
 SIMD_INLINE v128 v128_padd_s16(v128 a) {
   return vreinterpretq_s64_s32(vpaddlq_s16(vreinterpretq_s16_s64(a)));
 }
 
+SIMD_INLINE v128 v128_padd_u8(v128 a) {
+  return vreinterpretq_s64_u16(vpaddlq_u8(vreinterpretq_u8_s64(a)));
+}
+
 typedef struct {
   sad64_internal hi, lo;
 } sad128_internal;
@@ -120,7 +153,12 @@
 }
 
 SIMD_INLINE uint32_t v128_sad_u8_sum(sad128_internal s) {
-  return (uint32_t)(v64_sad_u8_sum(s.hi) + v64_sad_u8_sum(s.lo));
+#if defined(__aarch64__)
+  return vaddlvq_u16(s.hi) + vaddlvq_u16(s.lo);
+#else
+  uint64x2_t t = vpaddlq_u32(vpaddlq_u16(vaddq_u16(s.hi, s.lo)));
+  return (uint32_t)(uint64_t)(vget_high_u64(t) + vget_low_u64(t));
+#endif
 }
 
 typedef struct {
@@ -129,7 +167,7 @@
 
 SIMD_INLINE ssd128_internal v128_ssd_u8_init() {
   ssd128_internal s;
-  s.hi = s.lo = (ssd64_internal)(uint64_t)0;
+  s.hi = s.lo = v64_ssd_u8_init();
   return s;
 }
 
@@ -159,6 +197,16 @@
       vaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
 }
 
+SIMD_INLINE v128 v128_sadd_u8(v128 x, v128 y) {
+  return vreinterpretq_s64_u8(
+      vqaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
+}
+
+SIMD_INLINE v128 v128_sadd_s8(v128 x, v128 y) {
+  return vreinterpretq_s64_s8(
+      vqaddq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
+}
+
 SIMD_INLINE v128 v128_add_16(v128 x, v128 y) {
   return vreinterpretq_s64_s16(
       vaddq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
@@ -174,6 +222,11 @@
       vaddq_u32(vreinterpretq_u32_s64(x), vreinterpretq_u32_s64(y)));
 }
 
+SIMD_INLINE v128 v128_add_64(v128 x, v128 y) {
+  return vreinterpretq_s64_u64(
+      vaddq_u64(vreinterpretq_u64_s64(x), vreinterpretq_u64_s64(y)));
+}
+
 SIMD_INLINE v128 v128_sub_8(v128 x, v128 y) {
   return vreinterpretq_s64_u8(
       vsubq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
@@ -209,6 +262,8 @@
       vsubq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
 }
 
+SIMD_INLINE v128 v128_sub_64(v128 x, v128 y) { return vsubq_s64(x, y); }
+
 SIMD_INLINE v128 v128_abs_s16(v128 x) {
   return vreinterpretq_s64_s16(vabsq_s16(vreinterpretq_s16_s64(x)));
 }
@@ -228,8 +283,16 @@
 }
 
 SIMD_INLINE v128 v128_mulhi_s16(v128 a, v128 b) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_s16(vuzp2q_s16(
+      vreinterpretq_s16_s32(vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)),
+                                      vreinterpret_s16_s64(vget_low_s64(b)))),
+      vreinterpretq_s16_s32(
+          vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b)))));
+#else
   return v128_from_v64(v64_mulhi_s16(vget_high_s64(a), vget_high_s64(b)),
                        v64_mulhi_s16(vget_low_s64(a), vget_low_s64(b)));
+#endif
 }
 
 SIMD_INLINE v128 v128_mullo_s32(v128 a, v128 b) {
@@ -238,13 +301,32 @@
 }
 
 SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) {
+#if defined(__aarch64__)
+  int32x4_t t1 = vmull_s16(vreinterpret_s16_s64(vget_low_s64(a)),
+                           vreinterpret_s16_s64(vget_low_s64(b)));
+  int32x4_t t2 =
+      vmull_high_s16(vreinterpretq_s16_s64(a), vreinterpretq_s16_s64(b));
+  return vreinterpretq_s64_s32(vpaddq_s32(t1, t2));
+#else
   return v128_from_v64(v64_madd_s16(vget_high_s64(a), vget_high_s64(b)),
                        v64_madd_s16(vget_low_s64(a), vget_low_s64(b)));
+#endif
 }
 
 SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
+#if defined(__aarch64__)
+  int16x8_t t1 = vmulq_s16(
+      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_low_s64(a)))),
+      vmovl_s8(vreinterpret_s8_s64(vget_low_s64(b))));
+  int16x8_t t2 = vmulq_s16(
+      vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(vget_high_s64(a)))),
+      vmovl_s8(vreinterpret_s8_s64(vget_high_s64(b))));
+  return vreinterpretq_s64_s16(
+      vqaddq_s16(vuzp1q_s16(t1, t2), vuzp2q_s16(t1, t2)));
+#else
   return v128_from_v64(v64_madd_us8(vget_high_s64(a), vget_high_s64(b)),
                        v64_madd_us8(vget_low_s64(a), vget_low_s64(b)));
+#endif
 }
 
 SIMD_INLINE v128 v128_avg_u8(v128 x, v128 y) {
@@ -257,6 +339,11 @@
       vhaddq_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(y)));
 }
 
+SIMD_INLINE v128 v128_rdavg_u16(v128 x, v128 y) {
+  return vreinterpretq_s64_u16(
+      vhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
+}
+
 SIMD_INLINE v128 v128_avg_u16(v128 x, v128 y) {
   return vreinterpretq_s64_u16(
       vrhaddq_u16(vreinterpretq_u16_s64(x), vreinterpretq_u16_s64(y)));
@@ -277,6 +364,26 @@
       vminq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
 }
 
+SIMD_INLINE uint32_t v128_movemask_8(v128 a) {
+  a = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(a), vdupq_n_s8(0)));
+#if defined(__aarch64__)
+  uint8x16_t m =
+      vandq_u8(vreinterpretq_u8_s64(a),
+               vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL)));
+  return vaddv_u8(vget_low_u8(m)) + (vaddv_u8(vget_high_u8(m)) << 8);
+#else
+  uint64x2_t m = vpaddlq_u32(vpaddlq_u16(vpaddlq_u8(
+      vandq_u8(vreinterpretq_u8_s64(a),
+               vreinterpretq_u8_u64(vdupq_n_u64(0x8040201008040201ULL))))));
+  return v64_u64(v64_ziplo_8(v128_high_v64((v128)m), v128_low_v64((v128)m)));
+#endif
+}
+
+SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
+  c = vreinterpretq_s64_u8(vcltq_s8(vreinterpretq_s8_s64(c), vdupq_n_s8(0)));
+  return v128_or(v128_and(b, c), v128_andn(a, c));
+}
+
 SIMD_INLINE v128 v128_max_s8(v128 x, v128 y) {
   return vreinterpretq_s64_s8(
       vmaxq_s8(vreinterpretq_s8_s64(x), vreinterpretq_s8_s64(y)));
@@ -292,14 +399,34 @@
       vmaxq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
 }
 
+SIMD_INLINE v128 v128_min_s32(v128 x, v128 y) {
+  return vreinterpretq_s64_s32(
+      vminq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
+}
+
+SIMD_INLINE v128 v128_max_s32(v128 x, v128 y) {
+  return vreinterpretq_s64_s32(
+      vmaxq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
+}
+
 SIMD_INLINE v128 v128_ziplo_8(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u8(
+      vzip1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
+#else
   uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
   return vreinterpretq_s64_u8(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v128 v128_ziphi_8(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u8(
+      vzip2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
+#else
   uint8x16x2_t r = vzipq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
   return vreinterpretq_s64_u8(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v128 v128_zip_8(v64 x, v64 y) {
@@ -308,13 +435,23 @@
 }
 
 SIMD_INLINE v128 v128_ziplo_16(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u16(
+      vzip1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
+#else
   int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
   return vreinterpretq_s64_s16(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v128 v128_ziphi_16(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u16(
+      vzip2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
+#else
   int16x8x2_t r = vzipq_s16(vreinterpretq_s16_s64(y), vreinterpretq_s16_s64(x));
   return vreinterpretq_s64_s16(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v128 v128_zip_16(v64 x, v64 y) {
@@ -323,13 +460,23 @@
 }
 
 SIMD_INLINE v128 v128_ziplo_32(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u32(
+      vzip1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
+#else
   int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
   return vreinterpretq_s64_s32(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v128 v128_ziphi_32(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u32(
+      vzip2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
+#else
   int32x4x2_t r = vzipq_s32(vreinterpretq_s32_s64(y), vreinterpretq_s32_s64(x));
   return vreinterpretq_s64_s32(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v128 v128_zip_32(v64 x, v64 y) {
@@ -347,37 +494,67 @@
 }
 
 SIMD_INLINE v128 v128_unziplo_8(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u8(
+      vuzp1q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
+#else
   uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
   return vreinterpretq_s64_u8(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v128 v128_unziphi_8(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u8(
+      vuzp2q_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x)));
+#else
   uint8x16x2_t r = vuzpq_u8(vreinterpretq_u8_s64(y), vreinterpretq_u8_s64(x));
   return vreinterpretq_s64_u8(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v128 v128_unziplo_16(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u16(
+      vuzp1q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
+#else
   uint16x8x2_t r =
       vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
   return vreinterpretq_s64_u16(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v128 v128_unziphi_16(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u16(
+      vuzp2q_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x)));
+#else
   uint16x8x2_t r =
       vuzpq_u16(vreinterpretq_u16_s64(y), vreinterpretq_u16_s64(x));
   return vreinterpretq_s64_u16(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v128 v128_unziplo_32(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u32(
+      vuzp1q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
+#else
   uint32x4x2_t r =
       vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
   return vreinterpretq_s64_u32(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v128 v128_unziphi_32(v128 x, v128 y) {
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u32(
+      vuzp2q_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x)));
+#else
   uint32x4x2_t r =
       vuzpq_u32(vreinterpretq_u32_s64(y), vreinterpretq_u32_s64(x));
   return vreinterpretq_s64_u32(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v128 v128_unpack_u8_s16(v64 a) {
@@ -410,6 +587,12 @@
       vreinterpret_s64_s16(vqmovn_s32(vreinterpretq_s32_s64(b))));
 }
 
+SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
+  return v128_from_v64(
+      vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(a))),
+      vreinterpret_s64_u16(vqmovun_s32(vreinterpretq_s32_s64(b))));
+}
+
 SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
   return v128_from_v64(
       vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s64(a))),
@@ -451,15 +634,17 @@
 }
 
 SIMD_INLINE v128 v128_shuffle_8(v128 x, v128 pattern) {
-  return v128_from_64(
-      (uint64_t)vreinterpret_s64_u8(
-          vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)),
-                                    vget_high_u8(vreinterpretq_u8_s64(x)) } },
-                   vreinterpret_u8_s64(vget_high_s64(pattern)))),
-      (uint64_t)vreinterpret_s64_u8(
-          vtbl2_u8((uint8x8x2_t){ { vget_low_u8(vreinterpretq_u8_s64(x)),
-                                    vget_high_u8(vreinterpretq_u8_s64(x)) } },
-                   vreinterpret_u8_s64(vget_low_s64(pattern)))));
+#if defined(__aarch64__)
+  return vreinterpretq_s64_u8(
+      vqtbl1q_u8(vreinterpretq_u8_s64(x), vreinterpretq_u8_s64(pattern)));
+#else
+  uint8x8x2_t p = { { vget_low_u8(vreinterpretq_u8_s64(x)),
+                      vget_high_u8(vreinterpretq_u8_s64(x)) } };
+  return v128_from_64((uint64_t)vreinterpret_s64_u8(vtbl2_u8(
+                          p, vreinterpret_u8_s64(vget_high_s64(pattern)))),
+                      (uint64_t)vreinterpret_s64_u8(vtbl2_u8(
+                          p, vreinterpret_u8_s64(vget_low_s64(pattern)))));
+#endif
 }
 
 SIMD_INLINE v128 v128_cmpgt_s8(v128 x, v128 y) {
@@ -492,6 +677,21 @@
       vceqq_s16(vreinterpretq_s16_s64(x), vreinterpretq_s16_s64(y)));
 }
 
+SIMD_INLINE v128 v128_cmpgt_s32(v128 x, v128 y) {
+  return vreinterpretq_s64_u32(
+      vcgtq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
+}
+
+SIMD_INLINE v128 v128_cmplt_s32(v128 x, v128 y) {
+  return vreinterpretq_s64_u32(
+      vcltq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
+}
+
+SIMD_INLINE v128 v128_cmpeq_32(v128 x, v128 y) {
+  return vreinterpretq_s64_u32(
+      vceqq_s32(vreinterpretq_s32_s64(x), vreinterpretq_s32_s64(y)));
+}
+
 SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
   return (c > 7) ? v128_zero()
                  : vreinterpretq_s64_u8(
@@ -546,6 +746,22 @@
                         vshlq_s32(vreinterpretq_s32_s64(a), vdupq_n_s32(-c)));
 }
 
+SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
+  return (c > 63) ? v128_zero()
+                  : vreinterpretq_s64_u64(
+                        vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(c)));
+}
+
+SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
+  return (c > 63) ? v128_zero()
+                  : vreinterpretq_s64_u64(
+                        vshlq_u64(vreinterpretq_u64_s64(a), vdupq_n_s64(-c)));
+}
+
+SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
+  return (c > 63) ? v128_ones() : vshlq_s64(a, vdupq_n_s64(-c));
+}
+
 #if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
 
 SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
@@ -619,6 +835,18 @@
   return vreinterpretq_s64_s32(vshrq_n_s32(vreinterpretq_s32_s64(a), c));
 }
 
+SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) {
+  return vreinterpretq_s64_u64(vshlq_n_u64(vreinterpretq_u64_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) {
+  return vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(a), c));
+}
+
+SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) {
+  return vshrq_n_s64(a, c);
+}
+
 #else
 
 SIMD_INLINE v128 v128_shl_n_byte(v128 a, unsigned int n) {
@@ -675,6 +903,55 @@
   return v128_shr_s32(a, c);
 }
 
+SIMD_INLINE v128 v128_shl_n_64(v128 a, unsigned int c) {
+  return v128_shl_64(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_u64(v128 a, unsigned int c) {
+  return v128_shr_u64(a, c);
+}
+
+SIMD_INLINE v128 v128_shr_n_s64(v128 a, unsigned int c) {
+  return v128_shr_s64(a, c);
+}
+
 #endif
 
+typedef uint32x4_t sad128_internal_u16;
+
+SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return vdupq_n_u32(0); }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v128_sad_u16_sum(). */
+SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
+                                             v128 b) {
+  return vaddq_u32(
+      s, vpaddlq_u16(vsubq_u16(
+             vmaxq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)),
+             vminq_u16(vreinterpretq_u16_s64(a), vreinterpretq_u16_s64(b)))));
+}
+
+SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
+  uint64x2_t t = vpaddlq_u32(s);
+  return (uint32_t)(uint64_t)vget_high_u64(t) +
+         (uint32_t)(uint64_t)vget_low_u64(t);
+}
+
+typedef v128 ssd128_internal_s16;
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v128_ssd_s16_sum(). */
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
+                                             v128 b) {
+  v128 d = v128_sub_16(a, b);
+  d = v128_madd_s16(d, d);
+  return v128_add_64(
+      s, vreinterpretq_s64_u64(vpaddlq_u32(vreinterpretq_u32_s64(d))));
+}
+
+SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
+  return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
+}
+
 #endif /* _V128_INTRINSICS_H */
diff --git a/aom_dsp/simd/v128_intrinsics_c.h b/aom_dsp/simd/v128_intrinsics_c.h
index 237496f..5051e6e 100644
--- a/aom_dsp/simd/v128_intrinsics_c.h
+++ b/aom_dsp/simd/v128_intrinsics_c.h
@@ -78,6 +78,14 @@
   return c_v128_load_unaligned(p);
 }
 
+SIMD_INLINE c_v128 c_v128_load_two_v64(const void *p0, const void *p1) {
+  return c_v128_from_v64(c_v64_load_unaligned(p0), c_v64_load_unaligned(p1));
+}
+
+SIMD_INLINE c_v128 c_v128_load_low_v64(const void *p) {
+  return c_v128_from_v64(c_v64_zero(), c_v64_load_unaligned(p));
+}
+
 SIMD_INLINE void c_v128_store_unaligned(void *p, c_v128 a) {
   uint8_t *pp = (uint8_t *)p;
   uint8_t *q = (uint8_t *)&a;
@@ -117,11 +125,30 @@
   return t;
 }
 
+SIMD_INLINE c_v128 c_v128_dup_64(uint64_t x) {
+  c_v128 t;
+  t.u64[1] = t.u64[0] = x;
+  return t;
+}
+
+SIMD_INLINE int64_t c_v128_dotp_su8(c_v128 a, c_v128 b) {
+  return c_v64_dotp_su8(a.v64[1], b.v64[1]) +
+         c_v64_dotp_su8(a.v64[0], b.v64[0]);
+}
+
 SIMD_INLINE int64_t c_v128_dotp_s16(c_v128 a, c_v128 b) {
   return c_v64_dotp_s16(a.v64[1], b.v64[1]) +
          c_v64_dotp_s16(a.v64[0], b.v64[0]);
 }
 
+SIMD_INLINE int64_t c_v128_dotp_s32(c_v128 a, c_v128 b) {
+  // 32-bit products, 64-bit sum
+  return (int64_t)(int32_t)((int64_t)a.s32[3] * b.s32[3]) +
+         (int64_t)(int32_t)((int64_t)a.s32[2] * b.s32[2]) +
+         (int64_t)(int32_t)((int64_t)a.s32[1] * b.s32[1]) +
+         (int64_t)(int32_t)((int64_t)a.s32[0] * b.s32[0]);
+}
+
 SIMD_INLINE uint64_t c_v128_hadd_u8(c_v128 a) {
   return c_v64_hadd_u8(a.v64[1]) + c_v64_hadd_u8(a.v64[0]);
 }
@@ -188,6 +215,16 @@
                          c_v64_add_16(a.v64[0], b.v64[0]));
 }
 
+SIMD_INLINE c_v128 c_v128_sadd_u8(c_v128 a, c_v128 b) {
+  return c_v128_from_v64(c_v64_sadd_u8(a.v64[1], b.v64[1]),
+                         c_v64_sadd_u8(a.v64[0], b.v64[0]));
+}
+
+SIMD_INLINE c_v128 c_v128_sadd_s8(c_v128 a, c_v128 b) {
+  return c_v128_from_v64(c_v64_sadd_s8(a.v64[1], b.v64[1]),
+                         c_v64_sadd_s8(a.v64[0], b.v64[0]));
+}
+
 SIMD_INLINE c_v128 c_v128_sadd_s16(c_v128 a, c_v128 b) {
   return c_v128_from_v64(c_v64_sadd_s16(a.v64[1], b.v64[1]),
                          c_v64_sadd_s16(a.v64[0], b.v64[0]));
@@ -198,6 +235,15 @@
                          c_v64_add_32(a.v64[0], b.v64[0]));
 }
 
+SIMD_INLINE c_v128 c_v128_add_64(c_v128 a, c_v128 b) {
+  // Two's complement overflow (silences sanitizers)
+  return c_v128_from_64(
+      a.v64[1].u64 > ~b.v64[1].u64 ? a.v64[1].u64 - ~b.v64[1].u64 - 1
+                                   : a.v64[1].u64 + b.v64[1].u64,
+      a.v64[0].u64 > ~b.v64[0].u64 ? a.v64[0].u64 - ~b.v64[0].u64 - 1
+                                   : a.v64[0].u64 + b.v64[0].u64);
+}
+
 SIMD_INLINE c_v128 c_v128_padd_s16(c_v128 a) {
   c_v128 t;
   t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
@@ -207,6 +253,19 @@
   return t;
 }
 
+SIMD_INLINE c_v128 c_v128_padd_u8(c_v128 a) {
+  c_v128 t;
+  t.u16[0] = (uint16_t)a.u8[0] + (uint16_t)a.u8[1];
+  t.u16[1] = (uint16_t)a.u8[2] + (uint16_t)a.u8[3];
+  t.u16[2] = (uint16_t)a.u8[4] + (uint16_t)a.u8[5];
+  t.u16[3] = (uint16_t)a.u8[6] + (uint16_t)a.u8[7];
+  t.u16[4] = (uint16_t)a.u8[8] + (uint16_t)a.u8[9];
+  t.u16[5] = (uint16_t)a.u8[10] + (uint16_t)a.u8[11];
+  t.u16[6] = (uint16_t)a.u8[12] + (uint16_t)a.u8[13];
+  t.u16[7] = (uint16_t)a.u8[14] + (uint16_t)a.u8[15];
+  return t;
+}
+
 SIMD_INLINE c_v128 c_v128_sub_8(c_v128 a, c_v128 b) {
   return c_v128_from_v64(c_v64_sub_8(a.v64[1], b.v64[1]),
                          c_v64_sub_8(a.v64[0], b.v64[0]));
@@ -242,6 +301,15 @@
                          c_v64_sub_32(a.v64[0], b.v64[0]));
 }
 
+SIMD_INLINE c_v128 c_v128_sub_64(c_v128 a, c_v128 b) {
+  // Two's complement underflow (silences sanitizers)
+  return c_v128_from_64(
+      a.v64[1].u64 < b.v64[1].u64 ? a.v64[1].u64 + ~b.v64[1].u64 + 1
+                                  : a.v64[1].u64 - b.v64[1].u64,
+      a.v64[0].u64 < b.v64[0].u64 ? a.v64[0].u64 + ~b.v64[0].u64 + 1
+                                  : a.v64[0].u64 - b.v64[0].u64);
+}
+
 SIMD_INLINE c_v128 c_v128_abs_s16(c_v128 a) {
   return c_v128_from_v64(c_v64_abs_s16(a.v64[1]), c_v64_abs_s16(a.v64[0]));
 }
@@ -292,6 +360,11 @@
                          c_v64_rdavg_u8(a.v64[0], b.v64[0]));
 }
 
+SIMD_INLINE c_v128 c_v128_rdavg_u16(c_v128 a, c_v128 b) {
+  return c_v128_from_v64(c_v64_rdavg_u16(a.v64[1], b.v64[1]),
+                         c_v64_rdavg_u16(a.v64[0], b.v64[0]));
+}
+
 SIMD_INLINE c_v128 c_v128_avg_u16(c_v128 a, c_v128 b) {
   return c_v128_from_v64(c_v64_avg_u16(a.v64[1], b.v64[1]),
                          c_v64_avg_u16(a.v64[0], b.v64[0]));
@@ -312,6 +385,22 @@
                          c_v64_min_s8(a.v64[0], b.v64[0]));
 }
 
+SIMD_INLINE uint32_t c_v128_movemask_8(c_v128 a) {
+  return ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) |
+         ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) |
+         ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) |
+         ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) |
+         ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) |
+         ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) |
+         ((a.s8[0] < 0) << 0);
+}
+
+SIMD_INLINE c_v128 c_v128_blend_8(c_v128 a, c_v128 b, c_v128 c) {
+  c_v128 t;
+  for (int i = 0; i < 16; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i];
+  return t;
+}
+
 SIMD_INLINE c_v128 c_v128_max_s8(c_v128 a, c_v128 b) {
   return c_v128_from_v64(c_v64_max_s8(a.v64[1], b.v64[1]),
                          c_v64_max_s8(a.v64[0], b.v64[0]));
@@ -327,6 +416,20 @@
                          c_v64_max_s16(a.v64[0], b.v64[0]));
 }
 
+SIMD_INLINE c_v128 c_v128_max_s32(c_v128 a, c_v128 b) {
+  c_v128 t;
+  int c;
+  for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? a.s32[c] : b.s32[c];
+  return t;
+}
+
+SIMD_INLINE c_v128 c_v128_min_s32(c_v128 a, c_v128 b) {
+  c_v128 t;
+  int c;
+  for (c = 0; c < 4; c++) t.s32[c] = a.s32[c] > b.s32[c] ? b.s32[c] : a.s32[c];
+  return t;
+}
+
 SIMD_INLINE c_v128 c_v128_ziplo_8(c_v128 a, c_v128 b) {
   return c_v128_from_v64(c_v64_ziphi_8(a.v64[0], b.v64[0]),
                          c_v64_ziplo_8(a.v64[0], b.v64[0]));
@@ -520,6 +623,11 @@
                          c_v64_pack_s32_s16(b.v64[1], b.v64[0]));
 }
 
+SIMD_INLINE c_v128 c_v128_pack_s32_u16(c_v128 a, c_v128 b) {
+  return c_v128_from_v64(c_v64_pack_s32_u16(a.v64[1], a.v64[0]),
+                         c_v64_pack_s32_u16(b.v64[1], b.v64[0]));
+}
+
 SIMD_INLINE c_v128 c_v128_pack_s16_u8(c_v128 a, c_v128 b) {
   return c_v128_from_v64(c_v64_pack_s16_u8(a.v64[1], a.v64[0]),
                          c_v64_pack_s16_u8(b.v64[1], b.v64[0]));
@@ -561,15 +669,10 @@
 SIMD_INLINE c_v128 c_v128_shuffle_8(c_v128 a, c_v128 pattern) {
   c_v128 t;
   int c;
-  for (c = 0; c < 16; c++) {
-    if (pattern.u8[c] & ~15) {
-      fprintf(stderr, "Undefined v128_shuffle_8 index %d/%d\n", pattern.u8[c],
-              c);
-      abort();
-    }
+  for (c = 0; c < 16; c++)
     t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 15 - (pattern.u8[c] & 15)
                                      : pattern.u8[c] & 15];
-  }
+
   return t;
 }
 
@@ -603,7 +706,28 @@
                          c_v64_cmpeq_16(a.v64[0], b.v64[0]));
 }
 
-SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_cmpgt_s32(c_v128 a, c_v128 b) {
+  c_v128 t;
+  int c;
+  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] > b.s32[c]);
+  return t;
+}
+
+SIMD_INLINE c_v128 c_v128_cmplt_s32(c_v128 a, c_v128 b) {
+  c_v128 t;
+  int c;
+  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] < b.s32[c]);
+  return t;
+}
+
+SIMD_INLINE c_v128 c_v128_cmpeq_32(c_v128 a, c_v128 b) {
+  c_v128 t;
+  int c;
+  for (c = 0; c < 4; c++) t.s32[c] = -(a.s32[c] == b.s32[c]);
+  return t;
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_byte(c_v128 a, const unsigned int n) {
   if (n < 8)
     return c_v128_from_v64(c_v64_or(c_v64_shl_n_byte(a.v64[1], n),
                                     c_v64_shr_n_byte(a.v64[0], 8 - n)),
@@ -612,7 +736,7 @@
     return c_v128_from_v64(c_v64_shl_n_byte(a.v64[0], n - 8), c_v64_zero());
 }
 
-SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shr_n_byte(c_v128 a, const unsigned int n) {
   if (n < 8)
     return c_v128_from_v64(c_v64_shr_n_byte(a.v64[1], n),
                            c_v64_or(c_v64_shr_n_byte(a.v64[0], n),
@@ -621,7 +745,7 @@
     return c_v128_from_v64(c_v64_zero(), c_v64_shr_n_byte(a.v64[1], n - 8));
 }
 
-SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_align(c_v128 a, c_v128 b, const unsigned int c) {
   if (SIMD_CHECK && c > 15) {
     fprintf(stderr, "Error: undefined alignment %d\n", c);
     abort();
@@ -630,80 +754,143 @@
            : b;
 }
 
-SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shl_8(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shl_8(a.v64[1], c), c_v64_shl_8(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shr_u8(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shr_u8(a.v64[1], c), c_v64_shr_u8(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shr_s8(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shr_s8(a.v64[1], c), c_v64_shr_s8(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shl_16(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shl_16(a.v64[1], c), c_v64_shl_16(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shr_u16(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shr_u16(a.v64[1], c),
                          c_v64_shr_u16(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shr_s16(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shr_s16(a.v64[1], c),
                          c_v64_shr_s16(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shl_32(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shl_32(a.v64[1], c), c_v64_shl_32(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shr_u32(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shr_u32(a.v64[1], c),
                          c_v64_shr_u32(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, unsigned int c) {
+SIMD_INLINE c_v128 c_v128_shr_s32(c_v128 a, const unsigned int c) {
   return c_v128_from_v64(c_v64_shr_s32(a.v64[1], c),
                          c_v64_shr_s32(a.v64[0], c));
 }
 
-SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shl_64(c_v128 a, const unsigned int c) {
+  a.v64[1].u64 <<= c;
+  a.v64[0].u64 <<= c;
+  return c_v128_from_v64(a.v64[1], a.v64[0]);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_u64(c_v128 a, const unsigned int c) {
+  a.v64[1].u64 >>= c;
+  a.v64[0].u64 >>= c;
+  return c_v128_from_v64(a.v64[1], a.v64[0]);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_s64(c_v128 a, const unsigned int c) {
+  a.v64[1].s64 >>= c;
+  a.v64[0].s64 >>= c;
+  return c_v128_from_v64(a.v64[1], a.v64[0]);
+}
+
+SIMD_INLINE c_v128 c_v128_shl_n_8(c_v128 a, const unsigned int n) {
   return c_v128_shl_8(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shl_n_16(c_v128 a, const unsigned int n) {
   return c_v128_shl_16(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shl_n_32(c_v128 a, const unsigned int n) {
   return c_v128_shl_32(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shl_n_64(c_v128 a, const unsigned int n) {
+  return c_v128_shl_64(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_u8(c_v128 a, const unsigned int n) {
   return c_v128_shr_u8(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shr_n_u16(c_v128 a, const unsigned int n) {
   return c_v128_shr_u16(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shr_n_u32(c_v128 a, const unsigned int n) {
   return c_v128_shr_u32(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shr_n_u64(c_v128 a, const unsigned int n) {
+  return c_v128_shr_u64(a, n);
+}
+
+SIMD_INLINE c_v128 c_v128_shr_n_s8(c_v128 a, const unsigned int n) {
   return c_v128_shr_s8(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shr_n_s16(c_v128 a, const unsigned int n) {
   return c_v128_shr_s16(a, n);
 }
 
-SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, unsigned int n) {
+SIMD_INLINE c_v128 c_v128_shr_n_s32(c_v128 a, const unsigned int n) {
   return c_v128_shr_s32(a, n);
 }
 
+SIMD_INLINE c_v128 c_v128_shr_n_s64(c_v128 a, const unsigned int n) {
+  return c_v128_shr_s64(a, n);
+}
+
+typedef uint32_t c_sad128_internal_u16;
+
+SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16_init() { return 0; }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v128_sad_u16_sum(). */
+SIMD_INLINE c_sad128_internal_u16 c_v128_sad_u16(c_sad128_internal_u16 s,
+                                                 c_v128 a, c_v128 b) {
+  int c;
+  for (c = 0; c < 8; c++)
+    s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c];
+  return s;
+}
+
+SIMD_INLINE uint32_t c_v128_sad_u16_sum(c_sad128_internal_u16 s) { return s; }
+
+typedef uint64_t c_ssd128_internal_s16;
+
+SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16_init() { return 0; }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v128_ssd_s16_sum(). */
+SIMD_INLINE c_ssd128_internal_s16 c_v128_ssd_s16(c_ssd128_internal_s16 s,
+                                                 c_v128 a, c_v128 b) {
+  int c;
+  for (c = 0; c < 8; c++)
+    s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) *
+         (int32_t)(int16_t)(a.s16[c] - b.s16[c]);
+  return s;
+}
+
+SIMD_INLINE uint64_t c_v128_ssd_s16_sum(c_ssd128_internal_s16 s) { return s; }
+
 #endif /* _V128_INTRINSICS_C_H */
diff --git a/aom_dsp/simd/v128_intrinsics_x86.h b/aom_dsp/simd/v128_intrinsics_x86.h
index c2b078f..72a2261 100644
--- a/aom_dsp/simd/v128_intrinsics_x86.h
+++ b/aom_dsp/simd/v128_intrinsics_x86.h
@@ -62,7 +62,7 @@
 // Some compilers will check this during optimisation, others wont.
 #if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
 #if defined(__SSSE3__)
-SIMD_INLINE v128 v128_align(v128 a, v128 b, unsigned int c) {
+SIMD_INLINE v128 v128_align(v128 a, v128 b, const unsigned int c) {
   return c ? _mm_alignr_epi8(a, b, c) : b;
 }
 #else
@@ -86,14 +86,25 @@
 
 SIMD_INLINE v128 v128_dup_32(uint32_t x) { return _mm_set1_epi32(x); }
 
+SIMD_INLINE v128 v128_dup_64(uint64_t x) {
+  // _mm_set_pi64x and _mm_cvtsi64x_si64 missing in some compilers
+  return _mm_set_epi32(x >> 32, (uint32_t)x, x >> 32, (uint32_t)x);
+}
+
 SIMD_INLINE v128 v128_add_8(v128 a, v128 b) { return _mm_add_epi8(a, b); }
 
 SIMD_INLINE v128 v128_add_16(v128 a, v128 b) { return _mm_add_epi16(a, b); }
 
+SIMD_INLINE v128 v128_sadd_u8(v128 a, v128 b) { return _mm_adds_epu8(a, b); }
+
+SIMD_INLINE v128 v128_sadd_s8(v128 a, v128 b) { return _mm_adds_epi8(a, b); }
+
 SIMD_INLINE v128 v128_sadd_s16(v128 a, v128 b) { return _mm_adds_epi16(a, b); }
 
 SIMD_INLINE v128 v128_add_32(v128 a, v128 b) { return _mm_add_epi32(a, b); }
 
+SIMD_INLINE v128 v128_add_64(v128 a, v128 b) { return _mm_add_epi64(a, b); }
+
 SIMD_INLINE v128 v128_padd_s16(v128 a) {
   return _mm_madd_epi16(a, _mm_set1_epi16(1));
 }
@@ -112,6 +123,8 @@
 
 SIMD_INLINE v128 v128_sub_32(v128 a, v128 b) { return _mm_sub_epi32(a, b); }
 
+SIMD_INLINE v128 v128_sub_64(v128 a, v128 b) { return _mm_sub_epi64(a, b); }
+
 SIMD_INLINE v128 v128_abs_s16(v128 a) {
 #if defined(__SSSE3__)
   return _mm_abs_epi16(a);
@@ -241,6 +254,15 @@
   return _mm_packs_epi32(b, a);
 }
 
+SIMD_INLINE v128 v128_pack_s32_u16(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+  return _mm_packus_epi32(b, a);
+#else
+  return v128_from_v64(v64_pack_s32_u16(v128_high_v64(a), v128_low_v64(a)),
+                       v64_pack_s32_u16(v128_high_v64(b), v128_low_v64(b)));
+#endif
+}
+
 SIMD_INLINE v128 v128_pack_s16_u8(v128 a, v128 b) {
   return _mm_packus_epi16(b, a);
 }
@@ -291,6 +313,15 @@
 #endif
 }
 
+SIMD_INLINE int64_t v128_dotp_su8(v128 a, v128 b) {
+  v128 t1 = _mm_madd_epi16(v128_unpackhi_s8_s16(a), v128_unpackhi_u8_s16(b));
+  v128 t2 = _mm_madd_epi16(v128_unpacklo_s8_s16(a), v128_unpacklo_u8_s16(b));
+  v128 t = v128_add_32(t1, t2);
+  t = v128_add_32(t, _mm_srli_si128(t, 8));
+  t = v128_add_32(t, _mm_srli_si128(t, 4));
+  return (int32_t)v128_low_u32(t);
+}
+
 SIMD_INLINE int64_t v128_dotp_s16(v128 a, v128 b) {
   v128 r = _mm_madd_epi16(a, b);
 #if defined(__SSE4_1__) && defined(__x86_64__)
@@ -325,31 +356,25 @@
   return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
 }
 
-typedef v128 ssd128_internal;
+typedef int32_t ssd128_internal;
 
-SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return _mm_setzero_si128(); }
+SIMD_INLINE ssd128_internal v128_ssd_u8_init() { return 0; }
 
 /* Implementation dependent return value.  Result must be finalised with
  * v128_ssd_sum(). */
 SIMD_INLINE ssd128_internal v128_ssd_u8(ssd128_internal s, v128 a, v128 b) {
-  v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, _mm_setzero_si128()),
-                         _mm_unpacklo_epi8(b, _mm_setzero_si128()));
-  v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, _mm_setzero_si128()),
-                         _mm_unpackhi_epi8(b, _mm_setzero_si128()));
+  v128 z = _mm_setzero_si128();
+  v128 l = _mm_sub_epi16(_mm_unpacklo_epi8(a, z), _mm_unpacklo_epi8(b, z));
+  v128 h = _mm_sub_epi16(_mm_unpackhi_epi8(a, z), _mm_unpackhi_epi8(b, z));
   v128 rl = _mm_madd_epi16(l, l);
   v128 rh = _mm_madd_epi16(h, h);
-  v128 c = _mm_cvtsi32_si128(32);
-  rl = _mm_add_epi32(rl, _mm_srli_si128(rl, 8));
-  rl = _mm_add_epi32(rl, _mm_srli_si128(rl, 4));
-  rh = _mm_add_epi32(rh, _mm_srli_si128(rh, 8));
-  rh = _mm_add_epi32(rh, _mm_srli_si128(rh, 4));
-  return _mm_add_epi64(
-      s, _mm_srl_epi64(_mm_sll_epi64(_mm_unpacklo_epi64(rl, rh), c), c));
+  v128 r = _mm_add_epi32(rl, rh);
+  r = _mm_add_epi32(r, _mm_srli_si128(r, 8));
+  r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
+  return s + _mm_cvtsi128_si32(r);
 }
 
-SIMD_INLINE uint32_t v128_ssd_u8_sum(ssd128_internal s) {
-  return v128_low_u32(_mm_add_epi32(s, _mm_unpackhi_epi64(s, s)));
-}
+SIMD_INLINE int32_t v128_ssd_u8_sum(ssd128_internal s) { return s; }
 
 SIMD_INLINE v128 v128_or(v128 a, v128 b) { return _mm_or_si128(a, b); }
 
@@ -385,6 +410,14 @@
 #endif
 }
 
+SIMD_INLINE int64_t v128_dotp_s32(v128 a, v128 b) {
+  v128 r = v128_mullo_s32(a, b);
+  return (int64_t)_mm_cvtsi128_si32(r) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 4)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 8)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(r, 12));
+}
+
 SIMD_INLINE v128 v128_madd_s16(v128 a, v128 b) { return _mm_madd_epi16(a, b); }
 
 SIMD_INLINE v128 v128_madd_us8(v128 a, v128 b) {
@@ -399,6 +432,10 @@
 #endif
 }
 
+SIMD_INLINE v128 v128_padd_u8(v128 a) {
+  return v128_madd_us8(a, _mm_set1_epi8(1));
+}
+
 SIMD_INLINE v128 v128_avg_u8(v128 a, v128 b) { return _mm_avg_epu8(a, b); }
 
 SIMD_INLINE v128 v128_rdavg_u8(v128 a, v128 b) {
@@ -406,6 +443,11 @@
                       _mm_and_si128(_mm_xor_si128(a, b), v128_dup_8(1)));
 }
 
+SIMD_INLINE v128 v128_rdavg_u16(v128 a, v128 b) {
+  return _mm_sub_epi16(_mm_avg_epu16(a, b),
+                       _mm_and_si128(_mm_xor_si128(a, b), v128_dup_16(1)));
+}
+
 SIMD_INLINE v128 v128_avg_u16(v128 a, v128 b) { return _mm_avg_epu16(a, b); }
 
 SIMD_INLINE v128 v128_min_u8(v128 a, v128 b) { return _mm_min_epu8(a, b); }
@@ -421,6 +463,17 @@
 #endif
 }
 
+SIMD_INLINE uint32_t v128_movemask_8(v128 a) { return _mm_movemask_epi8(a); }
+
+SIMD_INLINE v128 v128_blend_8(v128 a, v128 b, v128 c) {
+#if defined(__SSE4_1__)
+  return _mm_blendv_epi8(a, b, c);
+#else
+  c = _mm_cmplt_epi8(c, v128_zero());
+  return v128_or(v128_and(b, c), v128_andn(a, c));
+#endif
+}
+
 SIMD_INLINE v128 v128_max_s8(v128 a, v128 b) {
 #if defined(__SSE4_1__)
   return _mm_max_epi8(a, b);
@@ -434,6 +487,24 @@
 
 SIMD_INLINE v128 v128_max_s16(v128 a, v128 b) { return _mm_max_epi16(a, b); }
 
+SIMD_INLINE v128 v128_min_s32(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+  return _mm_min_epi32(a, b);
+#else
+  v128 mask = _mm_cmplt_epi32(a, b);
+  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
+SIMD_INLINE v128 v128_max_s32(v128 a, v128 b) {
+#if defined(__SSE4_1__)
+  return _mm_max_epi32(a, b);
+#else
+  v128 mask = _mm_cmplt_epi32(b, a);
+  return _mm_or_si128(_mm_andnot_si128(mask, b), _mm_and_si128(mask, a));
+#endif
+}
+
 SIMD_INLINE v128 v128_cmpgt_s8(v128 a, v128 b) { return _mm_cmpgt_epi8(a, b); }
 
 SIMD_INLINE v128 v128_cmplt_s8(v128 a, v128 b) { return _mm_cmplt_epi8(a, b); }
@@ -448,6 +519,16 @@
   return _mm_cmplt_epi16(a, b);
 }
 
+SIMD_INLINE v128 v128_cmpeq_32(v128 a, v128 b) { return _mm_cmpeq_epi32(a, b); }
+
+SIMD_INLINE v128 v128_cmpgt_s32(v128 a, v128 b) {
+  return _mm_cmpgt_epi32(a, b);
+}
+
+SIMD_INLINE v128 v128_cmplt_s32(v128 a, v128 b) {
+  return _mm_cmplt_epi32(a, b);
+}
+
 SIMD_INLINE v128 v128_cmpeq_16(v128 a, v128 b) { return _mm_cmpeq_epi16(a, b); }
 
 SIMD_INLINE v128 v128_shl_8(v128 a, unsigned int c) {
@@ -490,6 +571,21 @@
   return _mm_sra_epi32(a, _mm_cvtsi32_si128(c));
 }
 
+SIMD_INLINE v128 v128_shl_64(v128 a, unsigned int c) {
+  return _mm_sll_epi64(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v128 v128_shr_u64(v128 a, unsigned int c) {
+  return _mm_srl_epi64(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v128 v128_shr_s64(v128 a, unsigned int c) {
+  // _mm_sra_epi64 is missing in gcc?
+  return v128_from_64((int64_t)v64_u64(v128_high_v64(a)) >> c,
+                      (int64_t)v64_u64(v128_low_v64(a)) >> c);
+  // return _mm_sra_epi64(a, _mm_cvtsi32_si128(c));
+}
+
 /* These intrinsics require immediate values, so we must use #defines
    to enforce that. */
 #define v128_shl_n_byte(a, c) _mm_slli_si128(a, c)
@@ -507,5 +603,53 @@
 #define v128_shl_n_32(a, c) _mm_slli_epi32(a, c)
 #define v128_shr_n_u32(a, c) _mm_srli_epi32(a, c)
 #define v128_shr_n_s32(a, c) _mm_srai_epi32(a, c)
+#define v128_shl_n_64(a, c) _mm_slli_epi64(a, c)
+#define v128_shr_n_u64(a, c) _mm_srli_epi64(a, c)
+#define v128_shr_n_s64(a, c) \
+  v128_shr_s64(a, c)  // _mm_srai_epi64 missing in gcc?
+
+typedef v128 sad128_internal_u16;
+
+SIMD_INLINE sad128_internal_u16 v128_sad_u16_init() { return v128_zero(); }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v128_sad_u16_sum(). */
+SIMD_INLINE sad128_internal_u16 v128_sad_u16(sad128_internal_u16 s, v128 a,
+                                             v128 b) {
+#if defined(__SSE4_1__)
+  v128 t = v128_sub_16(_mm_max_epu16(a, b), _mm_min_epu16(a, b));
+#else
+  v128 t = v128_cmplt_s16(v128_xor(a, v128_dup_16(32768)),
+                          v128_xor(b, v128_dup_16(32768)));
+  t = v128_sub_16(v128_or(v128_and(b, t), v128_andn(a, t)),
+                  v128_or(v128_and(a, t), v128_andn(b, t)));
+#endif
+  return v128_add_32(
+      s, v128_add_32(v128_unpackhi_u16_s32(t), v128_unpacklo_u16_s32(t)));
+}
+
+SIMD_INLINE uint32_t v128_sad_u16_sum(sad128_internal_u16 s) {
+  return v128_low_u32(s) + v128_low_u32(v128_shr_n_byte(s, 4)) +
+         v128_low_u32(v128_shr_n_byte(s, 8)) +
+         v128_low_u32(v128_shr_n_byte(s, 12));
+}
+
+typedef v128 ssd128_internal_s16;
+
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16_init() { return v128_zero(); }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v128_ssd_s16_sum(). */
+SIMD_INLINE ssd128_internal_s16 v128_ssd_s16(ssd128_internal_s16 s, v128 a,
+                                             v128 b) {
+  v128 d = v128_sub_16(a, b);
+  d = v128_madd_s16(d, d);
+  return v128_add_64(s, v128_add_64(_mm_unpackhi_epi32(d, v128_zero()),
+                                    _mm_unpacklo_epi32(d, v128_zero())));
+}
+
+SIMD_INLINE uint64_t v128_ssd_s16_sum(ssd128_internal_s16 s) {
+  return v64_u64(v128_low_v64(s)) + v64_u64(v128_high_v64(s));
+}
 
 #endif /* _V128_INTRINSICS_H */
diff --git a/aom_dsp/simd/v256_intrinsics.h b/aom_dsp/simd/v256_intrinsics.h
index ae31ae7..0e5ae5b 100644
--- a/aom_dsp/simd/v256_intrinsics.h
+++ b/aom_dsp/simd/v256_intrinsics.h
@@ -26,6 +26,7 @@
 
 SIMD_INLINE uint32_t v256_low_u32(v256 a) { return c_v256_low_u32(a); }
 SIMD_INLINE v64 v256_low_v64(v256 a) { return c_v256_low_v64(a); }
+SIMD_INLINE uint64_t v256_low_u64(v256 a) { return c_v256_low_u64(a); }
 SIMD_INLINE v128 v256_low_v128(v256 a) { return c_v256_low_v128(a); }
 SIMD_INLINE v128 v256_high_v128(v256 a) { return c_v256_high_v128(a); }
 SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
@@ -60,6 +61,7 @@
 SIMD_INLINE v256 v256_dup_8(uint8_t x) { return c_v256_dup_8(x); }
 SIMD_INLINE v256 v256_dup_16(uint16_t x) { return c_v256_dup_16(x); }
 SIMD_INLINE v256 v256_dup_32(uint32_t x) { return c_v256_dup_32(x); }
+SIMD_INLINE v256 v256_dup_64(uint64_t x) { return c_v256_dup_64(x); }
 
 typedef uint32_t sad256_internal;
 SIMD_INLINE sad256_internal v256_sad_u8_init() { return c_v256_sad_u8_init(); }
@@ -77,9 +79,16 @@
 SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
   return c_v256_ssd_u8_sum(s);
 }
+
+SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
+  return c_v256_dotp_su8(a, b);
+}
 SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
   return c_v256_dotp_s16(a, b);
 }
+SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
+  return c_v256_dotp_s32(a, b);
+}
 SIMD_INLINE uint64_t v256_hadd_u8(v256 a) { return c_v256_hadd_u8(a); }
 
 SIMD_INLINE v256 v256_or(v256 a, v256 b) { return c_v256_or(a, b); }
@@ -89,8 +98,13 @@
 
 SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return c_v256_add_8(a, b); }
 SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return c_v256_add_16(a, b); }
+SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return c_v256_sadd_s8(a, b); }
+SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return c_v256_sadd_u8(a, b); }
 SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) { return c_v256_sadd_s16(a, b); }
 SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return c_v256_add_32(a, b); }
+SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return c_v256_add_64(a, b); }
+SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return c_v256_sub_64(a, b); }
+SIMD_INLINE v256 v256_padd_u8(v256 a) { return c_v256_padd_u8(a); }
 SIMD_INLINE v256 v256_padd_s16(v256 a) { return c_v256_padd_s16(a); }
 SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) { return c_v256_sub_8(a, b); }
 SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) { return c_v256_ssub_u8(a, b); }
@@ -115,8 +129,16 @@
 SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) { return c_v256_madd_s16(a, b); }
 SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) { return c_v256_madd_us8(a, b); }
 
+SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return c_v256_movemask_8(a); }
+SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
+  return c_v256_blend_8(a, b, c);
+}
+
 SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) { return c_v256_avg_u8(a, b); }
 SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) { return c_v256_rdavg_u8(a, b); }
+SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
+  return c_v256_rdavg_u16(a, b);
+}
 SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return c_v256_avg_u16(a, b); }
 SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return c_v256_min_u8(a, b); }
 SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) { return c_v256_max_u8(a, b); }
@@ -124,6 +146,8 @@
 SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return c_v256_max_s8(a, b); }
 SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return c_v256_min_s16(a, b); }
 SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return c_v256_max_s16(a, b); }
+SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return c_v256_min_s32(a, b); }
+SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return c_v256_max_s32(a, b); }
 
 SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) { return c_v256_ziplo_8(a, b); }
 SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) { return c_v256_ziphi_8(a, b); }
@@ -160,6 +184,12 @@
 SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
   return c_v256_unziphi_32(a, b);
 }
+SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
+  return c_v256_unziplo_64(a, b);
+}
+SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
+  return c_v256_unziphi_64(a, b);
+}
 SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) { return c_v256_unpack_u8_s16(a); }
 SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
   return c_v256_unpacklo_u8_s16(a);
@@ -177,6 +207,9 @@
 SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
   return c_v256_pack_s32_s16(a, b);
 }
+SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
+  return c_v256_pack_s32_u16(a, b);
+}
 SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
   return c_v256_pack_s16_u8(a, b);
 }
@@ -204,6 +237,9 @@
 SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
   return c_v256_shuffle_8(a, pattern);
 }
+SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) {
+  return c_v256_wideshuffle_8(a, b, pattern);
+}
 SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
   return c_v256_pshuffle_8(a, pattern);
 }
@@ -218,7 +254,14 @@
   return c_v256_cmplt_s16(a, b);
 }
 SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) { return c_v256_cmpeq_16(a, b); }
+SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) { return c_v256_cmpeq_32(a, b); }
 
+SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
+  return c_v256_cmpgt_s32(a, b);
+}
+SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
+  return c_v256_cmplt_s32(a, b);
+}
 SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
   return c_v256_shl_8(a, c);
 }
@@ -262,6 +305,9 @@
 SIMD_INLINE v256 v256_shl_n_32(v256 a, unsigned int n) {
   return c_v256_shl_n_32(a, n);
 }
+SIMD_INLINE v256 v256_shl_n_64(v256 a, unsigned int n) {
+  return c_v256_shl_n_64(a, n);
+}
 SIMD_INLINE v256 v256_shr_n_u8(v256 a, unsigned int n) {
   return c_v256_shr_n_u8(a, n);
 }
@@ -271,6 +317,9 @@
 SIMD_INLINE v256 v256_shr_n_u32(v256 a, unsigned int n) {
   return c_v256_shr_n_u32(a, n);
 }
+SIMD_INLINE v256 v256_shr_n_u64(v256 a, unsigned int n) {
+  return c_v256_shr_n_u64(a, n);
+}
 SIMD_INLINE v256 v256_shr_n_s8(v256 a, unsigned int n) {
   return c_v256_shr_n_s8(a, n);
 }
@@ -280,5 +329,39 @@
 SIMD_INLINE v256 v256_shr_n_s32(v256 a, unsigned int n) {
   return c_v256_shr_n_s32(a, n);
 }
+SIMD_INLINE v256 v256_shr_n_s64(v256 a, unsigned int n) {
+  return c_v256_shr_n_s64(a, n);
+}
+
+SIMD_INLINE v256 v256_shr_n_word(v256 a, unsigned int n) {
+  return c_v256_shr_n_word(a, n);
+}
+SIMD_INLINE v256 v256_shl_n_word(v256 a, unsigned int n) {
+  return c_v256_shl_n_word(a, n);
+}
+
+typedef uint32_t sad256_internal_u16;
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() {
+  return c_v256_sad_u16_init();
+}
+SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
+                                             v256 b) {
+  return c_v256_sad_u16(s, a, b);
+}
+SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
+  return c_v256_sad_u16_sum(s);
+}
+
+typedef uint64_t ssd256_internal_s16;
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() {
+  return c_v256_ssd_s16_init();
+}
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
+                                             v256 b) {
+  return c_v256_ssd_s16(s, a, b);
+}
+SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
+  return c_v256_ssd_s16_sum(s);
+}
 
 #endif /* _V256_INTRINSICS_H */
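
The new u16 SAD and s16 SSD accumulators above follow the same
init/accumulate/sum protocol as the existing u8 variants.  A minimal usage
sketch, assuming the v256_load_unaligned() loader declared elsewhere in this
header (the helper name is illustrative only):

SIMD_INLINE uint32_t row_sad_u16(const uint16_t *a, const uint16_t *b) {
  sad256_internal_u16 s = v256_sad_u16_init();
  /* One accumulation per 16-lane row; finalise exactly once at the end. */
  s = v256_sad_u16(s, v256_load_unaligned(a), v256_load_unaligned(b));
  return v256_sad_u16_sum(s);
}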
diff --git a/aom_dsp/simd/v256_intrinsics_c.h b/aom_dsp/simd/v256_intrinsics_c.h
index 890178d..5b412df 100644
--- a/aom_dsp/simd/v256_intrinsics_c.h
+++ b/aom_dsp/simd/v256_intrinsics_c.h
@@ -36,6 +36,8 @@
 
 SIMD_INLINE c_v64 c_v256_low_v64(c_v256 a) { return a.v64[0]; }
 
+SIMD_INLINE uint64_t c_v256_low_u64(c_v256 a) { return a.u64[0]; }
+
 SIMD_INLINE c_v128 c_v256_low_v128(c_v256 a) { return a.v128[0]; }
 
 SIMD_INLINE c_v128 c_v256_high_v128(c_v256 a) { return a.v128[1]; }
@@ -122,23 +124,39 @@
   return t;
 }
 
+SIMD_INLINE c_v256 c_v256_dup_64(uint64_t x) {
+  c_v256 t;
+  t.u64[3] = t.u64[2] = t.u64[1] = t.u64[0] = x;
+  return t;
+}
+
+SIMD_INLINE int64_t c_v256_dotp_su8(c_v256 a, c_v256 b) {
+  return c_v128_dotp_su8(a.v128[1], b.v128[1]) +
+         c_v128_dotp_su8(a.v128[0], b.v128[0]);
+}
+
 SIMD_INLINE int64_t c_v256_dotp_s16(c_v256 a, c_v256 b) {
   return c_v128_dotp_s16(a.v128[1], b.v128[1]) +
          c_v128_dotp_s16(a.v128[0], b.v128[0]);
 }
 
+SIMD_INLINE int64_t c_v256_dotp_s32(c_v256 a, c_v256 b) {
+  return c_v128_dotp_s32(a.v128[1], b.v128[1]) +
+         c_v128_dotp_s32(a.v128[0], b.v128[0]);
+}
+
 SIMD_INLINE uint64_t c_v256_hadd_u8(c_v256 a) {
   return c_v128_hadd_u8(a.v128[1]) + c_v128_hadd_u8(a.v128[0]);
 }
 
 typedef uint32_t c_sad256_internal;
 
-SIMD_INLINE c_sad128_internal c_v256_sad_u8_init() { return 0; }
+SIMD_INLINE c_sad256_internal c_v256_sad_u8_init() { return 0; }
 
 /* Implementation dependent return value.  Result must be finalised with
    v256_sad_u8_sum().
    The result for more than 16 v256_sad_u8() calls is undefined. */
-SIMD_INLINE c_sad128_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a,
+SIMD_INLINE c_sad256_internal c_v256_sad_u8(c_sad256_internal s, c_v256 a,
                                             c_v256 b) {
   int c;
   for (c = 0; c < 32; c++)
@@ -193,6 +211,16 @@
                           c_v128_add_16(a.v128[0], b.v128[0]));
 }
 
+SIMD_INLINE c_v256 c_v256_sadd_s8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_sadd_s8(a.v128[1], b.v128[1]),
+                          c_v128_sadd_s8(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sadd_u8(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_sadd_u8(a.v128[1], b.v128[1]),
+                          c_v128_sadd_u8(a.v128[0], b.v128[0]));
+}
+
 SIMD_INLINE c_v256 c_v256_sadd_s16(c_v256 a, c_v256 b) {
   return c_v256_from_v128(c_v128_sadd_s16(a.v128[1], b.v128[1]),
                           c_v128_sadd_s16(a.v128[0], b.v128[0]));
@@ -203,6 +231,23 @@
                           c_v128_add_32(a.v128[0], b.v128[0]));
 }
 
+SIMD_INLINE c_v256 c_v256_add_64(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_add_64(a.v128[1], b.v128[1]),
+                          c_v128_add_64(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_sub_64(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_sub_64(a.v128[1], b.v128[1]),
+                          c_v128_sub_64(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_padd_u8(c_v256 a) {
+  c_v256 t;
+  for (int i = 0; i < 16; i++)
+    t.u16[i] = (uint16_t)a.u8[i * 2] + (uint16_t)a.u8[i * 2 + 1];
+  return t;
+}
+
 SIMD_INLINE c_v256 c_v256_padd_s16(c_v256 a) {
   c_v256 t;
   t.s32[0] = (int32_t)a.s16[0] + (int32_t)a.s16[1];
@@ -301,6 +346,11 @@
                           c_v128_rdavg_u8(a.v128[0], b.v128[0]));
 }
 
+SIMD_INLINE c_v256 c_v256_rdavg_u16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_rdavg_u16(a.v128[1], b.v128[1]),
+                          c_v128_rdavg_u16(a.v128[0], b.v128[0]));
+}
+
 SIMD_INLINE c_v256 c_v256_avg_u16(c_v256 a, c_v256 b) {
   return c_v256_from_v128(c_v128_avg_u16(a.v128[1], b.v128[1]),
                           c_v128_avg_u16(a.v128[0], b.v128[0]));
@@ -321,6 +371,30 @@
                           c_v128_min_s8(a.v128[0], b.v128[0]));
 }
 
+SIMD_INLINE uint32_t c_v256_movemask_8(c_v256 a) {
+  return ((uint32_t)(a.s8[31] < 0) << 31) | ((a.s8[30] < 0) << 30) |
+         ((a.s8[29] < 0) << 29) | ((a.s8[28] < 0) << 28) |
+         ((a.s8[27] < 0) << 27) | ((a.s8[26] < 0) << 26) |
+         ((a.s8[25] < 0) << 25) | ((a.s8[24] < 0) << 24) |
+         ((a.s8[23] < 0) << 23) | ((a.s8[22] < 0) << 22) |
+         ((a.s8[21] < 0) << 21) | ((a.s8[20] < 0) << 20) |
+         ((a.s8[19] < 0) << 19) | ((a.s8[18] < 0) << 18) |
+         ((a.s8[17] < 0) << 17) | ((a.s8[16] < 0) << 16) |
+         ((a.s8[15] < 0) << 15) | ((a.s8[14] < 0) << 14) |
+         ((a.s8[13] < 0) << 13) | ((a.s8[12] < 0) << 12) |
+         ((a.s8[11] < 0) << 11) | ((a.s8[10] < 0) << 10) |
+         ((a.s8[9] < 0) << 9) | ((a.s8[8] < 0) << 8) | ((a.s8[7] < 0) << 7) |
+         ((a.s8[6] < 0) << 6) | ((a.s8[5] < 0) << 5) | ((a.s8[4] < 0) << 4) |
+         ((a.s8[3] < 0) << 3) | ((a.s8[2] < 0) << 2) | ((a.s8[1] < 0) << 1) |
+         ((a.s8[0] < 0) << 0);
+}
+
+SIMD_INLINE c_v256 c_v256_blend_8(c_v256 a, c_v256 b, c_v256 c) {
+  c_v256 t;
+  for (int i = 0; i < 32; i++) t.u8[i] = c.s8[i] < 0 ? b.u8[i] : a.u8[i];
+  return t;
+}
+
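
The blend_8/movemask_8 pair above is designed to consume the all-ones /
all-zeros lanes produced by the compare intrinsics: blend_8 selects per byte
on the sign of the control byte, and movemask_8 packs those sign bits into a
single 32-bit word.  A minimal sketch of the intended pattern (hypothetical
helper, using the existing c_v256_cmpgt_s16() from this header):

SIMD_INLINE c_v256 c_v256_select_max_s16(c_v256 a, c_v256 b) {
  /* cmpgt yields 0xffff per winning lane, so both bytes of that lane are
     negative and blend_8 picks b there. */
  return c_v256_blend_8(a, b, c_v256_cmpgt_s16(b, a));
}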
 SIMD_INLINE c_v256 c_v256_max_s8(c_v256 a, c_v256 b) {
   return c_v256_from_v128(c_v128_max_s8(a.v128[1], b.v128[1]),
                           c_v128_max_s8(a.v128[0], b.v128[0]));
@@ -336,6 +410,16 @@
                           c_v128_max_s16(a.v128[0], b.v128[0]));
 }
 
+SIMD_INLINE c_v256 c_v256_min_s32(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_min_s32(a.v128[1], b.v128[1]),
+                          c_v128_min_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_max_s32(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_max_s32(a.v128[1], b.v128[1]),
+                          c_v128_max_s32(a.v128[0], b.v128[0]));
+}
+
 SIMD_INLINE c_v256 c_v256_ziplo_8(c_v256 a, c_v256 b) {
   return c_v256_from_v128(c_v128_ziphi_8(a.v128[0], b.v128[0]),
                           c_v128_ziplo_8(a.v128[0], b.v128[0]));
@@ -484,6 +568,32 @@
                            : _c_v256_unzip_32(b, a, 1);
 }
 
+SIMD_INLINE c_v256 _c_v256_unzip_64(c_v256 a, c_v256 b, int mode) {
+  c_v256 t;
+  if (mode) {
+    t.u64[3] = b.u64[3];
+    t.u64[2] = b.u64[1];
+    t.u64[1] = a.u64[3];
+    t.u64[0] = a.u64[1];
+  } else {
+    t.u64[3] = a.u64[2];
+    t.u64[2] = a.u64[0];
+    t.u64[1] = b.u64[2];
+    t.u64[0] = b.u64[0];
+  }
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_unziplo_64(c_v256 a, c_v256 b) {
+  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(a, b, 1)
+                           : _c_v256_unzip_64(a, b, 0);
+}
+
+SIMD_INLINE c_v256 c_v256_unziphi_64(c_v256 a, c_v256 b) {
+  return CONFIG_BIG_ENDIAN ? _c_v256_unzip_64(b, a, 0)
+                           : _c_v256_unzip_64(b, a, 1);
+}
+
 SIMD_INLINE c_v256 c_v256_unpack_u8_s16(c_v128 a) {
   return c_v256_from_v128(c_v128_unpackhi_u8_s16(a), c_v128_unpacklo_u8_s16(a));
 }
@@ -517,6 +627,11 @@
                           c_v128_pack_s32_s16(b.v128[1], b.v128[0]));
 }
 
+SIMD_INLINE c_v256 c_v256_pack_s32_u16(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_pack_s32_u16(a.v128[1], a.v128[0]),
+                          c_v128_pack_s32_u16(b.v128[1], b.v128[0]));
+}
+
 SIMD_INLINE c_v256 c_v256_pack_s16_u8(c_v256 a, c_v256 b) {
   return c_v256_from_v128(c_v128_pack_s16_u8(a.v128[1], a.v128[0]),
                           c_v128_pack_s16_u8(b.v128[1], b.v128[0]));
@@ -560,15 +675,21 @@
 SIMD_INLINE c_v256 c_v256_shuffle_8(c_v256 a, c_v256 pattern) {
   c_v256 t;
   int c;
-  for (c = 0; c < 32; c++) {
-    if (pattern.u8[c] & ~31) {
-      fprintf(stderr, "Undefined v256_shuffle_8 index %d/%d\n", pattern.u8[c],
-              c);
-      abort();
-    }
+  for (c = 0; c < 32; c++)
     t.u8[c] = a.u8[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
                                      : pattern.u8[c] & 31];
-  }
+
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_wideshuffle_8(c_v256 a, c_v256 b, c_v256 pattern) {
+  c_v256 t;
+  int c;
+  for (c = 0; c < 32; c++)
+    t.u8[c] = (pattern.u8[c] < 32
+                   ? b.u8
+                   : a.u8)[CONFIG_BIG_ENDIAN ? 31 - (pattern.u8[c] & 31)
+                                             : pattern.u8[c] & 31];
   return t;
 }
 
@@ -609,6 +730,21 @@
                           c_v128_cmpeq_16(a.v128[0], b.v128[0]));
 }
 
+SIMD_INLINE c_v256 c_v256_cmpgt_s32(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_cmpgt_s32(a.v128[1], b.v128[1]),
+                          c_v128_cmpgt_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmplt_s32(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_cmplt_s32(a.v128[1], b.v128[1]),
+                          c_v128_cmplt_s32(a.v128[0], b.v128[0]));
+}
+
+SIMD_INLINE c_v256 c_v256_cmpeq_32(c_v256 a, c_v256 b) {
+  return c_v256_from_v128(c_v128_cmpeq_32(a.v128[1], b.v128[1]),
+                          c_v128_cmpeq_32(a.v128[0], b.v128[0]));
+}
+
 SIMD_INLINE c_v256 c_v256_shl_n_byte(c_v256 a, unsigned int n) {
   if (n < 16)
     return c_v256_from_v128(c_v128_or(c_v128_shl_n_byte(a.v128[1], n),
@@ -687,6 +823,45 @@
                           c_v128_shr_s32(a.v128[0], c));
 }
 
+SIMD_INLINE c_v256 c_v256_shr_s64(c_v256 a, unsigned int n) {
+  c_v256 t;
+  if (SIMD_CHECK && n > 63) {
+    fprintf(stderr, "Error: undefined s64 shift right %d\n", n);
+    abort();
+  }
+  t.s64[3] = a.s64[3] >> n;
+  t.s64[2] = a.s64[2] >> n;
+  t.s64[1] = a.s64[1] >> n;
+  t.s64[0] = a.s64[0] >> n;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_shr_u64(c_v256 a, unsigned int n) {
+  c_v256 t;
+  if (SIMD_CHECK && n > 63) {
+    fprintf(stderr, "Error: undefined s64 shift right %d\n", n);
+    abort();
+  }
+  t.u64[3] = a.u64[3] >> n;
+  t.u64[2] = a.u64[2] >> n;
+  t.u64[1] = a.u64[1] >> n;
+  t.u64[0] = a.u64[0] >> n;
+  return t;
+}
+
+SIMD_INLINE c_v256 c_v256_shl_64(c_v256 a, unsigned int n) {
+  c_v256 t;
+  if (SIMD_CHECK && n > 63) {
+    fprintf(stderr, "Error: undefined s64 shift right %d\n", n);
+    abort();
+  }
+  t.u64[3] = a.u64[3] << n;
+  t.u64[2] = a.u64[2] << n;
+  t.u64[1] = a.u64[1] << n;
+  t.u64[0] = a.u64[0] << n;
+  return t;
+}
+
 SIMD_INLINE c_v256 c_v256_shl_n_8(c_v256 a, unsigned int n) {
   return c_v256_shl_8(a, n);
 }
@@ -699,6 +874,10 @@
   return c_v256_shl_32(a, n);
 }
 
+SIMD_INLINE c_v256 c_v256_shl_n_64(c_v256 a, unsigned int n) {
+  return c_v256_shl_64(a, n);
+}
+
 SIMD_INLINE c_v256 c_v256_shr_n_u8(c_v256 a, unsigned int n) {
   return c_v256_shr_u8(a, n);
 }
@@ -711,6 +890,10 @@
   return c_v256_shr_u32(a, n);
 }
 
+SIMD_INLINE c_v256 c_v256_shr_n_u64(c_v256 a, unsigned int n) {
+  return c_v256_shr_u64(a, n);
+}
+
 SIMD_INLINE c_v256 c_v256_shr_n_s8(c_v256 a, unsigned int n) {
   return c_v256_shr_s8(a, n);
 }
@@ -723,4 +906,48 @@
   return c_v256_shr_s32(a, n);
 }
 
+SIMD_INLINE c_v256 c_v256_shr_n_s64(c_v256 a, unsigned int n) {
+  return c_v256_shr_s64(a, n);
+}
+
+SIMD_INLINE c_v256 c_v256_shr_n_word(c_v256 a, const unsigned int n) {
+  return c_v256_shr_n_byte(a, 2 * n);
+}
+SIMD_INLINE c_v256 c_v256_shl_n_word(c_v256 a, const unsigned int n) {
+  return c_v256_shl_n_byte(a, 2 * n);
+}
+
+typedef uint32_t c_sad256_internal_u16;
+
+SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16_init() { return 0; }
+
+/* Implementation dependent return value.  Result must be finalised with
+   v256_sad_u16_sum().
+   The result for more than 16 v256_sad_u16() calls is undefined. */
+SIMD_INLINE c_sad256_internal_u16 c_v256_sad_u16(c_sad256_internal_u16 s,
+                                                 c_v256 a, c_v256 b) {
+  int c;
+  for (c = 0; c < 16; c++)
+    s += a.u16[c] > b.u16[c] ? a.u16[c] - b.u16[c] : b.u16[c] - a.u16[c];
+  return s;
+}
+
+SIMD_INLINE uint32_t c_v256_sad_u16_sum(c_sad256_internal_u16 s) { return s; }
+
+typedef uint64_t c_ssd256_internal_s16;
+
+SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16_init() { return 0; }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v256_ssd_s16_sum(). */
+SIMD_INLINE c_ssd256_internal_s16 c_v256_ssd_s16(c_ssd256_internal_s16 s,
+                                                 c_v256 a, c_v256 b) {
+  int c;
+  for (c = 0; c < 16; c++)
+    s += (int32_t)(int16_t)(a.s16[c] - b.s16[c]) *
+         (int32_t)(int16_t)(a.s16[c] - b.s16[c]);
+  return s;
+}
+
+SIMD_INLINE uint64_t c_v256_ssd_s16_sum(c_ssd256_internal_s16 s) { return s; }
+
 #endif /* _V256_INTRINSICS_C_H */
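
One subtlety in c_v256_ssd_s16() above: the difference is computed in int
(integer promotion), truncated back to int16_t to model the intended 16-bit
wrap-around, and only then widened to int32_t for the square, which keeps the
reference implementation in step with what a 16-bit SIMD subtraction produces
when a.s16[c] - b.s16[c] overflows.  A standalone illustration of the wrap,
assuming the usual two's-complement conversion:

#include <assert.h>
#include <stdint.h>

int main(void) {
  int16_t a = -32768, b = 32767;
  /* The plain int difference is -65535; truncation to int16_t wraps it to 1,
     exactly what a 16-bit lane subtraction would give. */
  int32_t d = (int32_t)(int16_t)(a - b);
  assert(d == 1 && d * d == 1);
  return 0;
}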
diff --git a/aom_dsp/simd/v256_intrinsics_v128.h b/aom_dsp/simd/v256_intrinsics_v128.h
index e6d6746..60b2a17 100644
--- a/aom_dsp/simd/v256_intrinsics_v128.h
+++ b/aom_dsp/simd/v256_intrinsics_v128.h
@@ -20,22 +20,28 @@
 #include "aom_dsp/simd/v128_intrinsics.h"
 #endif
 
+#if HAVE_NEON
+typedef int64x2x2_t v256;
+#else
 typedef struct {
-  v128 lo, hi;
+  v128 val[2];
 } v256;
+#endif
 
-SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.lo); }
+SIMD_INLINE uint32_t v256_low_u32(v256 a) { return v128_low_u32(a.val[0]); }
 
-SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.lo); }
+SIMD_INLINE v64 v256_low_v64(v256 a) { return v128_low_v64(a.val[0]); }
 
-SIMD_INLINE v128 v256_low_v128(v256 a) { return a.lo; }
+SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }
 
-SIMD_INLINE v128 v256_high_v128(v256 a) { return a.hi; }
+SIMD_INLINE v128 v256_low_v128(v256 a) { return a.val[0]; }
+
+SIMD_INLINE v128 v256_high_v128(v256 a) { return a.val[1]; }
 
 SIMD_INLINE v256 v256_from_v128(v128 hi, v128 lo) {
   v256 t;
-  t.hi = hi;
-  t.lo = lo;
+  t.val[1] = hi;
+  t.val[0] = lo;
   return t;
 }
 
@@ -58,13 +64,13 @@
 }
 
 SIMD_INLINE void v256_store_unaligned(void *p, v256 a) {
-  v128_store_unaligned(p, a.lo);
-  v128_store_unaligned((uint8_t *)p + 16, a.hi);
+  v128_store_unaligned(p, a.val[0]);
+  v128_store_unaligned((uint8_t *)p + 16, a.val[1]);
 }
 
 SIMD_INLINE void v256_store_aligned(void *p, v256 a) {
-  v128_store_aligned(p, a.lo);
-  v128_store_aligned((uint8_t *)p + 16, a.hi);
+  v128_store_aligned(p, a.val[0]);
+  v128_store_aligned((uint8_t *)p + 16, a.val[1]);
 }
 
 SIMD_INLINE v256 v256_zero() {
@@ -86,23 +92,35 @@
   return v256_from_v128(t, t);
 }
 
+SIMD_INLINE v256 v256_dup_64(uint64_t x) {
+  v128 t = v128_dup_64(x);
+  return v256_from_v128(t, t);
+}
+
+SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
+  return v128_dotp_su8(a.val[1], b.val[1]) + v128_dotp_su8(a.val[0], b.val[0]);
+}
+
 SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
-  return v128_dotp_s16(a.hi, b.hi) + v128_dotp_s16(a.lo, b.lo);
+  return v128_dotp_s16(a.val[1], b.val[1]) + v128_dotp_s16(a.val[0], b.val[0]);
+}
+
+SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
+  return v128_dotp_s32(a.val[1], b.val[1]) + v128_dotp_s32(a.val[0], b.val[0]);
 }
 
 SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
-  return v128_hadd_u8(a.hi) + v128_hadd_u8(a.lo);
+  return v128_hadd_u8(a.val[1]) + v128_hadd_u8(a.val[0]);
 }
 
 typedef struct {
-  sad128_internal hi;
-  sad128_internal lo;
+  sad128_internal val[2];
 } sad256_internal;
 
 SIMD_INLINE sad256_internal v256_sad_u8_init() {
   sad256_internal t;
-  t.hi = v128_sad_u8_init();
-  t.lo = v128_sad_u8_init();
+  t.val[1] = v128_sad_u8_init();
+  t.val[0] = v128_sad_u8_init();
   return t;
 }
 
@@ -111,24 +129,23 @@
    The result for more than 16 v256_sad_u8() calls is undefined. */
 SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
   sad256_internal t;
-  t.hi = v128_sad_u8(s.hi, a.hi, b.hi);
-  t.lo = v128_sad_u8(s.lo, a.lo, b.lo);
+  t.val[1] = v128_sad_u8(s.val[1], a.val[1], b.val[1]);
+  t.val[0] = v128_sad_u8(s.val[0], a.val[0], b.val[0]);
   return t;
 }
 
 SIMD_INLINE uint32_t v256_sad_u8_sum(sad256_internal s) {
-  return v128_sad_u8_sum(s.hi) + v128_sad_u8_sum(s.lo);
+  return v128_sad_u8_sum(s.val[1]) + v128_sad_u8_sum(s.val[0]);
 }
 
 typedef struct {
-  ssd128_internal hi;
-  ssd128_internal lo;
+  ssd128_internal val[2];
 } ssd256_internal;
 
 SIMD_INLINE ssd256_internal v256_ssd_u8_init() {
   ssd256_internal t;
-  t.hi = v128_ssd_u8_init();
-  t.lo = v128_ssd_u8_init();
+  t.val[1] = v128_ssd_u8_init();
+  t.val[0] = v128_ssd_u8_init();
   return t;
 }
 
@@ -136,85 +153,124 @@
  * v256_ssd_u8_sum(). */
 SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
   ssd256_internal t;
-  t.hi = v128_ssd_u8(s.hi, a.hi, b.hi);
-  t.lo = v128_ssd_u8(s.lo, a.lo, b.lo);
+  t.val[1] = v128_ssd_u8(s.val[1], a.val[1], b.val[1]);
+  t.val[0] = v128_ssd_u8(s.val[0], a.val[0], b.val[0]);
   return t;
 }
 
 SIMD_INLINE uint32_t v256_ssd_u8_sum(ssd256_internal s) {
-  return v128_ssd_u8_sum(s.hi) + v128_ssd_u8_sum(s.lo);
+  return v128_ssd_u8_sum(s.val[1]) + v128_ssd_u8_sum(s.val[0]);
 }
 
 SIMD_INLINE v256 v256_or(v256 a, v256 b) {
-  return v256_from_v128(v128_or(a.hi, b.hi), v128_or(a.lo, b.lo));
+  return v256_from_v128(v128_or(a.val[1], b.val[1]),
+                        v128_or(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_xor(v256 a, v256 b) {
-  return v256_from_v128(v128_xor(a.hi, b.hi), v128_xor(a.lo, b.lo));
+  return v256_from_v128(v128_xor(a.val[1], b.val[1]),
+                        v128_xor(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_and(v256 a, v256 b) {
-  return v256_from_v128(v128_and(a.hi, b.hi), v128_and(a.lo, b.lo));
+  return v256_from_v128(v128_and(a.val[1], b.val[1]),
+                        v128_and(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_andn(v256 a, v256 b) {
-  return v256_from_v128(v128_andn(a.hi, b.hi), v128_andn(a.lo, b.lo));
+  return v256_from_v128(v128_andn(a.val[1], b.val[1]),
+                        v128_andn(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_add_8(v256 a, v256 b) {
-  return v256_from_v128(v128_add_8(a.hi, b.hi), v128_add_8(a.lo, b.lo));
+  return v256_from_v128(v128_add_8(a.val[1], b.val[1]),
+                        v128_add_8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_add_16(v256 a, v256 b) {
-  return v256_from_v128(v128_add_16(a.hi, b.hi), v128_add_16(a.lo, b.lo));
+  return v256_from_v128(v128_add_16(a.val[1], b.val[1]),
+                        v128_add_16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) {
+  return v256_from_v128(v128_sadd_s8(a.val[1], b.val[1]),
+                        v128_sadd_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) {
+  return v256_from_v128(v128_sadd_u8(a.val[1], b.val[1]),
+                        v128_sadd_u8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_sadd_s16(a.hi, b.hi), v128_sadd_s16(a.lo, b.lo));
+  return v256_from_v128(v128_sadd_s16(a.val[1], b.val[1]),
+                        v128_sadd_s16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_add_32(v256 a, v256 b) {
-  return v256_from_v128(v128_add_32(a.hi, b.hi), v128_add_32(a.lo, b.lo));
+  return v256_from_v128(v128_add_32(a.val[1], b.val[1]),
+                        v128_add_32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_add_64(v256 a, v256 b) {
+  return v256_from_v128(v128_add_64(a.val[1], b.val[1]),
+                        v128_add_64(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_padd_u8(v256 a) {
+  return v256_from_v128(v128_padd_u8(a.val[1]), v128_padd_u8(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_padd_s16(v256 a) {
-  return v256_from_v128(v128_padd_s16(a.hi), v128_padd_s16(a.lo));
+  return v256_from_v128(v128_padd_s16(a.val[1]), v128_padd_s16(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_sub_8(v256 a, v256 b) {
-  return v256_from_v128(v128_sub_8(a.hi, b.hi), v128_sub_8(a.lo, b.lo));
+  return v256_from_v128(v128_sub_8(a.val[1], b.val[1]),
+                        v128_sub_8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ssub_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_ssub_u8(a.hi, b.hi), v128_ssub_u8(a.lo, b.lo));
+  return v256_from_v128(v128_ssub_u8(a.val[1], b.val[1]),
+                        v128_ssub_u8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ssub_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_ssub_s8(a.hi, b.hi), v128_ssub_s8(a.lo, b.lo));
+  return v256_from_v128(v128_ssub_s8(a.val[1], b.val[1]),
+                        v128_ssub_s8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_sub_16(v256 a, v256 b) {
-  return v256_from_v128(v128_sub_16(a.hi, b.hi), v128_sub_16(a.lo, b.lo));
+  return v256_from_v128(v128_sub_16(a.val[1], b.val[1]),
+                        v128_sub_16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ssub_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_ssub_s16(a.hi, b.hi), v128_ssub_s16(a.lo, b.lo));
+  return v256_from_v128(v128_ssub_s16(a.val[1], b.val[1]),
+                        v128_ssub_s16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ssub_u16(v256 a, v256 b) {
-  return v256_from_v128(v128_ssub_u16(a.hi, b.hi), v128_ssub_u16(a.lo, b.lo));
+  return v256_from_v128(v128_ssub_u16(a.val[1], b.val[1]),
+                        v128_ssub_u16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) {
-  return v256_from_v128(v128_sub_32(a.hi, b.hi), v128_sub_32(a.lo, b.lo));
+  return v256_from_v128(v128_sub_32(a.val[1], b.val[1]),
+                        v128_sub_32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) {
+  return v256_from_v128(v128_sub_64(a.val[1], b.val[1]),
+                        v128_sub_64(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_abs_s16(v256 a) {
-  return v256_from_v128(v128_abs_s16(a.hi), v128_abs_s16(a.lo));
+  return v256_from_v128(v128_abs_s16(a.val[1]), v128_abs_s16(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_abs_s8(v256 a) {
-  return v256_from_v128(v128_abs_s8(a.hi), v128_abs_s8(a.lo));
+  return v256_from_v128(v128_abs_s8(a.val[1]), v128_abs_s8(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_mul_s16(v128 a, v128 b) {
@@ -225,99 +281,146 @@
 }
 
 SIMD_INLINE v256 v256_mullo_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_mullo_s16(a.hi, b.hi), v128_mullo_s16(a.lo, b.lo));
+  return v256_from_v128(v128_mullo_s16(a.val[1], b.val[1]),
+                        v128_mullo_s16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_mulhi_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_mulhi_s16(a.hi, b.hi), v128_mulhi_s16(a.lo, b.lo));
+  return v256_from_v128(v128_mulhi_s16(a.val[1], b.val[1]),
+                        v128_mulhi_s16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_mullo_s32(v256 a, v256 b) {
-  return v256_from_v128(v128_mullo_s32(a.hi, b.hi), v128_mullo_s32(a.lo, b.lo));
+  return v256_from_v128(v128_mullo_s32(a.val[1], b.val[1]),
+                        v128_mullo_s32(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_madd_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_madd_s16(a.hi, b.hi), v128_madd_s16(a.lo, b.lo));
+  return v256_from_v128(v128_madd_s16(a.val[1], b.val[1]),
+                        v128_madd_s16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_madd_us8(v256 a, v256 b) {
-  return v256_from_v128(v128_madd_us8(a.hi, b.hi), v128_madd_us8(a.lo, b.lo));
+  return v256_from_v128(v128_madd_us8(a.val[1], b.val[1]),
+                        v128_madd_us8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_avg_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_avg_u8(a.hi, b.hi), v128_avg_u8(a.lo, b.lo));
+  return v256_from_v128(v128_avg_u8(a.val[1], b.val[1]),
+                        v128_avg_u8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_rdavg_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_rdavg_u8(a.hi, b.hi), v128_rdavg_u8(a.lo, b.lo));
+  return v256_from_v128(v128_rdavg_u8(a.val[1], b.val[1]),
+                        v128_rdavg_u8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
+  return v256_from_v128(v128_rdavg_u16(a.val[1], b.val[1]),
+                        v128_rdavg_u16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) {
-  return v256_from_v128(v128_avg_u16(a.hi, b.hi), v128_avg_u16(a.lo, b.lo));
+  return v256_from_v128(v128_avg_u16(a.val[1], b.val[1]),
+                        v128_avg_u16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_min_u8(a.hi, b.hi), v128_min_u8(a.lo, b.lo));
+  return v256_from_v128(v128_min_u8(a.val[1], b.val[1]),
+                        v128_min_u8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_max_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_max_u8(a.hi, b.hi), v128_max_u8(a.lo, b.lo));
+  return v256_from_v128(v128_max_u8(a.val[1], b.val[1]),
+                        v128_max_u8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_min_s8(a.hi, b.hi), v128_min_s8(a.lo, b.lo));
+  return v256_from_v128(v128_min_s8(a.val[1], b.val[1]),
+                        v128_min_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE uint32_t v256_movemask_8(v256 a) {
+  return (v128_movemask_8(v256_high_v128(a)) << 16) |
+         v128_movemask_8(v256_low_v128(a));
+}
+
+SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
+  return v256_from_v128(v128_blend_8(a.val[1], b.val[1], c.val[1]),
+                        v128_blend_8(a.val[0], b.val[0], c.val[0]));
 }
 
 SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_max_s8(a.hi, b.hi), v128_max_s8(a.lo, b.lo));
+  return v256_from_v128(v128_max_s8(a.val[1], b.val[1]),
+                        v128_max_s8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_min_s16(a.hi, b.hi), v128_min_s16(a.lo, b.lo));
+  return v256_from_v128(v128_min_s16(a.val[1], b.val[1]),
+                        v128_min_s16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_max_s16(a.hi, b.hi), v128_max_s16(a.lo, b.lo));
+  return v256_from_v128(v128_max_s16(a.val[1], b.val[1]),
+                        v128_max_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) {
+  return v256_from_v128(v128_min_s32(a.val[1], b.val[1]),
+                        v128_min_s32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) {
+  return v256_from_v128(v128_max_s32(a.val[1], b.val[1]),
+                        v128_max_s32(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_8(a.lo, b.lo), v128_ziplo_8(a.lo, b.lo));
+  return v256_from_v128(v128_ziphi_8(a.val[0], b.val[0]),
+                        v128_ziplo_8(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_8(a.hi, b.hi), v128_ziplo_8(a.hi, b.hi));
+  return v256_from_v128(v128_ziphi_8(a.val[1], b.val[1]),
+                        v128_ziplo_8(a.val[1], b.val[1]));
 }
 
 SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_16(a.lo, b.lo), v128_ziplo_16(a.lo, b.lo));
+  return v256_from_v128(v128_ziphi_16(a.val[0], b.val[0]),
+                        v128_ziplo_16(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_16(a.hi, b.hi), v128_ziplo_16(a.hi, b.hi));
+  return v256_from_v128(v128_ziphi_16(a.val[1], b.val[1]),
+                        v128_ziplo_16(a.val[1], b.val[1]));
 }
 
 SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_32(a.lo, b.lo), v128_ziplo_32(a.lo, b.lo));
+  return v256_from_v128(v128_ziphi_32(a.val[0], b.val[0]),
+                        v128_ziplo_32(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_32(a.hi, b.hi), v128_ziplo_32(a.hi, b.hi));
+  return v256_from_v128(v128_ziphi_32(a.val[1], b.val[1]),
+                        v128_ziplo_32(a.val[1], b.val[1]));
 }
 
 SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_64(a.lo, b.lo), v128_ziplo_64(a.lo, b.lo));
+  return v256_from_v128(v128_ziphi_64(a.val[0], b.val[0]),
+                        v128_ziplo_64(a.val[0], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_64(a.hi, b.hi), v128_ziplo_64(a.hi, b.hi));
+  return v256_from_v128(v128_ziphi_64(a.val[1], b.val[1]),
+                        v128_ziplo_64(a.val[1], b.val[1]));
 }
 
 SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
-  return v256_from_v128(a.lo, b.lo);
+  return v256_from_v128(a.val[0], b.val[0]);
 }
 
 SIMD_INLINE v256 v256_ziphi_128(v256 a, v256 b) {
-  return v256_from_v128(a.hi, b.hi);
+  return v256_from_v128(a.val[1], b.val[1]);
 }
 
 SIMD_INLINE v256 v256_zip_8(v128 a, v128 b) {
@@ -333,31 +436,59 @@
 }
 
 SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
-  return v256_from_v128(v128_unziplo_8(a.hi, a.lo), v128_unziplo_8(b.hi, b.lo));
+  return v256_from_v128(v128_unziplo_8(a.val[1], a.val[0]),
+                        v128_unziplo_8(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
-  return v256_from_v128(v128_unziphi_8(a.hi, a.lo), v128_unziphi_8(b.hi, b.lo));
+  return v256_from_v128(v128_unziphi_8(a.val[1], a.val[0]),
+                        v128_unziphi_8(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
-  return v256_from_v128(v128_unziplo_16(a.hi, a.lo),
-                        v128_unziplo_16(b.hi, b.lo));
+  return v256_from_v128(v128_unziplo_16(a.val[1], a.val[0]),
+                        v128_unziplo_16(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
-  return v256_from_v128(v128_unziphi_16(a.hi, a.lo),
-                        v128_unziphi_16(b.hi, b.lo));
+  return v256_from_v128(v128_unziphi_16(a.val[1], a.val[0]),
+                        v128_unziphi_16(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
-  return v256_from_v128(v128_unziplo_32(a.hi, a.lo),
-                        v128_unziplo_32(b.hi, b.lo));
+  return v256_from_v128(v128_unziplo_32(a.val[1], a.val[0]),
+                        v128_unziplo_32(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
-  return v256_from_v128(v128_unziphi_32(a.hi, a.lo),
-                        v128_unziphi_32(b.hi, b.lo));
+  return v256_from_v128(v128_unziphi_32(a.val[1], a.val[0]),
+                        v128_unziphi_32(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
+#if HAVE_SSE2
+  return v256_from_v128(
+      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
+                                      _mm_castsi128_pd(a.val[1]), 0)),
+      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
+                                      _mm_castsi128_pd(b.val[1]), 0)));
+#else
+  return v256_from_v64(v128_low_v64(a.val[1]), v128_low_v64(a.val[0]),
+                       v128_low_v64(b.val[1]), v128_low_v64(b.val[0]));
+#endif
+}
+
+SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
+#if HAVE_SSE2
+  return v256_from_v128(
+      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(a.val[0]),
+                                      _mm_castsi128_pd(a.val[1]), 3)),
+      _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(b.val[0]),
+                                      _mm_castsi128_pd(b.val[1]), 3)));
+#else
+  return v256_from_v64(v128_high_v64(a.val[1]), v128_high_v64(a.val[0]),
+                       v128_high_v64(b.val[1]), v128_high_v64(b.val[0]));
+#endif
 }
 
 SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
@@ -365,11 +496,13 @@
 }
 
 SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_u8_s16(a.lo), v128_unpacklo_u8_s16(a.lo));
+  return v256_from_v128(v128_unpackhi_u8_s16(a.val[0]),
+                        v128_unpacklo_u8_s16(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_u8_s16(a.hi), v128_unpacklo_u8_s16(a.hi));
+  return v256_from_v128(v128_unpackhi_u8_s16(a.val[1]),
+                        v128_unpacklo_u8_s16(a.val[1]));
 }
 
 SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
@@ -377,26 +510,33 @@
 }
 
 SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_s8_s16(a.lo), v128_unpacklo_s8_s16(a.lo));
+  return v256_from_v128(v128_unpackhi_s8_s16(a.val[0]),
+                        v128_unpacklo_s8_s16(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_s8_s16(a.hi), v128_unpacklo_s8_s16(a.hi));
+  return v256_from_v128(v128_unpackhi_s8_s16(a.val[1]),
+                        v128_unpacklo_s8_s16(a.val[1]));
 }
 
 SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_pack_s32_s16(a.hi, a.lo),
-                        v128_pack_s32_s16(b.hi, b.lo));
+  return v256_from_v128(v128_pack_s32_s16(a.val[1], a.val[0]),
+                        v128_pack_s32_s16(b.val[1], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
+  return v256_from_v128(v128_pack_s32_u16(a.val[1], a.val[0]),
+                        v128_pack_s32_u16(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_pack_s16_u8(a.hi, a.lo),
-                        v128_pack_s16_u8(b.hi, b.lo));
+  return v256_from_v128(v128_pack_s16_u8(a.val[1], a.val[0]),
+                        v128_pack_s16_u8(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_pack_s16_s8(a.hi, a.lo),
-                        v128_pack_s16_s8(b.hi, b.lo));
+  return v256_from_v128(v128_pack_s16_s8(a.val[1], a.val[0]),
+                        v128_pack_s16_s8(b.val[1], b.val[0]));
 }
 
 SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
@@ -408,36 +548,172 @@
 }
 
 SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_u16_s32(a.lo),
-                        v128_unpacklo_u16_s32(a.lo));
+  return v256_from_v128(v128_unpackhi_u16_s32(a.val[0]),
+                        v128_unpacklo_u16_s32(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_s16_s32(a.lo),
-                        v128_unpacklo_s16_s32(a.lo));
+  return v256_from_v128(v128_unpackhi_s16_s32(a.val[0]),
+                        v128_unpacklo_s16_s32(a.val[0]));
 }
 
 SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_u16_s32(a.hi),
-                        v128_unpacklo_u16_s32(a.hi));
+  return v256_from_v128(v128_unpackhi_u16_s32(a.val[1]),
+                        v128_unpacklo_u16_s32(a.val[1]));
 }
 
 SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_s16_s32(a.hi),
-                        v128_unpacklo_s16_s32(a.hi));
+  return v256_from_v128(v128_unpackhi_s16_s32(a.val[1]),
+                        v128_unpacklo_s16_s32(a.val[1]));
 }
 
-SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
-  v128 c16 = v128_dup_8(16);
-  v128 maskhi = v128_cmplt_s8(pattern.hi, c16);
-  v128 masklo = v128_cmplt_s8(pattern.lo, c16);
+SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
+  return v256_from_v128(v128_cmpgt_s8(a.val[1], b.val[1]),
+                        v128_cmpgt_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
+  return v256_from_v128(v128_cmplt_s8(a.val[1], b.val[1]),
+                        v128_cmplt_s8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
+  return v256_from_v128(v128_cmpeq_8(a.val[1], b.val[1]),
+                        v128_cmpeq_8(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
+  return v256_from_v128(v128_cmpgt_s16(a.val[1], b.val[1]),
+                        v128_cmpgt_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
+  return v256_from_v128(v128_cmplt_s16(a.val[1], b.val[1]),
+                        v128_cmplt_s16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
+  return v256_from_v128(v128_cmpeq_16(a.val[1], b.val[1]),
+                        v128_cmpeq_16(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
+  return v256_from_v128(v128_cmpgt_s32(a.val[1], b.val[1]),
+                        v128_cmpgt_s32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
+  return v256_from_v128(v128_cmplt_s32(a.val[1], b.val[1]),
+                        v128_cmplt_s32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
+  return v256_from_v128(v128_cmpeq_32(a.val[1], b.val[1]),
+                        v128_cmpeq_32(a.val[0], b.val[0]));
+}
+
+SIMD_INLINE v256 v256_shuffle_8(v256 x, v256 pattern) {
+#if HAVE_NEON
+#if defined(__aarch64__)
+  uint8x16x2_t p = { { vreinterpretq_u8_s64(x.val[0]),
+                       vreinterpretq_u8_s64(x.val[1]) } };
   return v256_from_v128(
-      v128_or(
-          v128_and(v128_shuffle_8(a.lo, pattern.hi), maskhi),
-          v128_andn(v128_shuffle_8(a.hi, v128_sub_8(pattern.hi, c16)), maskhi)),
-      v128_or(v128_and(v128_shuffle_8(a.lo, pattern.lo), masklo),
-              v128_andn(v128_shuffle_8(a.hi, v128_sub_8(pattern.lo, c16)),
-                        masklo)));
+      vreinterpretq_s64_u8(vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))),
+      vreinterpretq_s64_u8(
+          vqtbl2q_u8(p, vreinterpretq_u8_s64(pattern.val[0]))));
+#else
+  uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])),
+                      vget_high_u8(vreinterpretq_u8_s64(x.val[0])),
+                      vget_low_u8(vreinterpretq_u8_s64(x.val[1])),
+                      vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } };
+  return v256_from_64(
+      (uint64_t)vreinterpret_s64_u8(
+          vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))),
+      (uint64_t)vreinterpret_s64_u8(
+          vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))),
+      (uint64_t)vreinterpret_s64_u8(
+          vtbl4_u8(p, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))),
+      (uint64_t)vreinterpret_s64_u8(
+          vtbl4_u8(p, vreinterpret_u8_s64(vget_low_s64(pattern.val[0])))));
+#endif
+#else
+  v128 c16 = v128_dup_8(16);
+  v128 maskhi = v128_cmplt_s8(pattern.val[1], c16);
+  v128 masklo = v128_cmplt_s8(pattern.val[0], c16);
+  return v256_from_v128(
+      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c16)),
+                   v128_shuffle_8(x.val[0], pattern.val[1]), maskhi),
+      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c16)),
+                   v128_shuffle_8(x.val[0], pattern.val[0]), masklo));
+#endif
+}
+
+SIMD_INLINE v256 v256_wideshuffle_8(v256 x, v256 y, v256 pattern) {
+#if HAVE_NEON
+#if defined(__aarch64__)
+  uint8x16x4_t p = { {
+      vreinterpretq_u8_s64(y.val[0]),
+      vreinterpretq_u8_s64(y.val[1]),
+      vreinterpretq_u8_s64(x.val[0]),
+      vreinterpretq_u8_s64(x.val[1]),
+  } };
+  return v256_from_v128(
+      vreinterpretq_s64_u8(vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[1]))),
+      vreinterpretq_s64_u8(
+          vqtbl4q_u8(p, vreinterpretq_u8_s64(pattern.val[0]))));
+#else
+  v256 c32 = v256_dup_8(32);
+  v256 p32 = v256_sub_8(pattern, c32);
+  uint8x8x4_t p = { { vget_low_u8(vreinterpretq_u8_s64(x.val[0])),
+                      vget_high_u8(vreinterpretq_u8_s64(x.val[0])),
+                      vget_low_u8(vreinterpretq_u8_s64(x.val[1])),
+                      vget_high_u8(vreinterpretq_u8_s64(x.val[1])) } };
+  uint8x8x4_t q = { { vget_low_u8(vreinterpretq_u8_s64(y.val[0])),
+                      vget_high_u8(vreinterpretq_u8_s64(y.val[0])),
+                      vget_low_u8(vreinterpretq_u8_s64(y.val[1])),
+                      vget_high_u8(vreinterpretq_u8_s64(y.val[1])) } };
+  v256 r1 =
+      v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       p, vreinterpret_u8_s64(vget_high_s64(p32.val[1])))),
+                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       p, vreinterpret_u8_s64(vget_low_s64(p32.val[1])))),
+                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       p, vreinterpret_u8_s64(vget_high_s64(p32.val[0])))),
+                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       p, vreinterpret_u8_s64(vget_low_s64(p32.val[0])))));
+  v256 r2 =
+      v256_from_64((uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       q, vreinterpret_u8_s64(vget_high_s64(pattern.val[1])))),
+                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       q, vreinterpret_u8_s64(vget_low_s64(pattern.val[1])))),
+                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       q, vreinterpret_u8_s64(vget_high_s64(pattern.val[0])))),
+                   (uint64_t)vreinterpret_s64_u8(vtbl4_u8(
+                       q, vreinterpret_u8_s64(vget_low_s64(pattern.val[0])))));
+  return v256_blend_8(r1, r2, v256_cmplt_s8(pattern, c32));
+#endif
+#else
+  v128 c16 = v128_dup_8(16);
+  v128 c32 = v128_dup_8(32);
+  v128 c48 = v128_dup_8(48);
+  v128 maskhi16 = v128_cmpgt_s8(c16, pattern.val[1]);
+  v128 masklo16 = v128_cmpgt_s8(c16, pattern.val[0]);
+  v128 maskhi48 = v128_cmpgt_s8(c48, pattern.val[1]);
+  v128 masklo48 = v128_cmpgt_s8(c48, pattern.val[0]);
+  v256 r1 = v256_from_v128(
+      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[1], c48)),
+                   v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[1], c32)),
+                   maskhi48),
+      v128_blend_8(v128_shuffle_8(x.val[1], v128_sub_8(pattern.val[0], c48)),
+                   v128_shuffle_8(x.val[0], v128_sub_8(pattern.val[0], c32)),
+                   masklo48));
+  v256 r2 = v256_from_v128(
+      v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[1], c16)),
+                   v128_shuffle_8(y.val[0], pattern.val[1]), maskhi16),
+      v128_blend_8(v128_shuffle_8(y.val[1], v128_sub_8(pattern.val[0], c16)),
+                   v128_shuffle_8(y.val[0], pattern.val[0]), masklo16));
+  return v256_blend_8(r1, r2, v256_cmpgt_s8(v256_from_v128(c32, c32), pattern));
+#endif
 }
 
 SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
@@ -446,104 +722,152 @@
       v128_shuffle_8(v256_low_v128(a), v256_low_v128(pattern)));
 }
 
-SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_cmpgt_s8(a.hi, b.hi), v128_cmpgt_s8(a.lo, b.lo));
+SIMD_INLINE v256 v256_shl_8(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shl_8(a.val[1], c), v128_shl_8(a.val[0], c));
 }
 
-SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_cmplt_s8(a.hi, b.hi), v128_cmplt_s8(a.lo, b.lo));
+SIMD_INLINE v256 v256_shr_u8(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_u8(a.val[1], c), v128_shr_u8(a.val[0], c));
 }
 
-SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
-  return v256_from_v128(v128_cmpeq_8(a.hi, b.hi), v128_cmpeq_8(a.lo, b.lo));
+SIMD_INLINE v256 v256_shr_s8(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_s8(a.val[1], c), v128_shr_s8(a.val[0], c));
 }
 
-SIMD_INLINE v256 v256_cmpgt_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_cmpgt_s16(a.hi, b.hi), v128_cmpgt_s16(a.lo, b.lo));
+SIMD_INLINE v256 v256_shl_16(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shl_16(a.val[1], c), v128_shl_16(a.val[0], c));
 }
 
-SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_cmplt_s16(a.hi, b.hi), v128_cmplt_s16(a.lo, b.lo));
+SIMD_INLINE v256 v256_shr_u16(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_u16(a.val[1], c), v128_shr_u16(a.val[0], c));
 }
 
-SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
-  return v256_from_v128(v128_cmpeq_16(a.hi, b.hi), v128_cmpeq_16(a.lo, b.lo));
+SIMD_INLINE v256 v256_shr_s16(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_s16(a.val[1], c), v128_shr_s16(a.val[0], c));
 }
 
-SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shl_8(a.hi, c), v128_shl_8(a.lo, c));
+SIMD_INLINE v256 v256_shl_32(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shl_32(a.val[1], c), v128_shl_32(a.val[0], c));
 }
 
-SIMD_INLINE v256 v256_shr_u8(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shr_u8(a.hi, c), v128_shr_u8(a.lo, c));
+SIMD_INLINE v256 v256_shr_u32(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_u32(a.val[1], c), v128_shr_u32(a.val[0], c));
 }
 
-SIMD_INLINE v256 v256_shr_s8(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shr_s8(a.hi, c), v128_shr_s8(a.lo, c));
+SIMD_INLINE v256 v256_shr_s32(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_s32(a.val[1], c), v128_shr_s32(a.val[0], c));
 }
 
-SIMD_INLINE v256 v256_shl_16(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shl_16(a.hi, c), v128_shl_16(a.lo, c));
+SIMD_INLINE v256 v256_shl_64(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shl_64(a.val[1], c), v128_shl_64(a.val[0], c));
 }
 
-SIMD_INLINE v256 v256_shr_u16(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shr_u16(a.hi, c), v128_shr_u16(a.lo, c));
+SIMD_INLINE v256 v256_shr_u64(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_u64(a.val[1], c), v128_shr_u64(a.val[0], c));
 }
 
-SIMD_INLINE v256 v256_shr_s16(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shr_s16(a.hi, c), v128_shr_s16(a.lo, c));
-}
-
-SIMD_INLINE v256 v256_shl_32(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shl_32(a.hi, c), v128_shl_32(a.lo, c));
-}
-
-SIMD_INLINE v256 v256_shr_u32(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shr_u32(a.hi, c), v128_shr_u32(a.lo, c));
-}
-
-SIMD_INLINE v256 v256_shr_s32(v256 a, unsigned int c) {
-  return v256_from_v128(v128_shr_s32(a.hi, c), v128_shr_s32(a.lo, c));
+SIMD_INLINE v256 v256_shr_s64(v256 a, const unsigned int c) {
+  return v256_from_v128(v128_shr_s64(a.val[1], c), v128_shr_s64(a.val[0], c));
 }
 
 /* These intrinsics require immediate values, so we must use #defines
    to enforce that. */
-#define v256_shl_n_byte(a, n)                                                 \
-  ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.hi, n),                \
-                                     v128_shr_n_byte(a.lo, (16 - (n)) & 31)), \
-                             v128_shl_n_byte(a.lo, (n)))                      \
-            : v256_from_v128(                                                 \
-                  (n) > 16 ? v128_shl_n_byte(a.lo, ((n)-16) & 31) : a.lo,     \
+#define v256_shl_n_byte(a, n)                                              \
+  ((n) < 16 ? v256_from_v128(v128_or(v128_shl_n_byte(a.val[1], n),         \
+                                     v128_shr_n_byte(a.val[0], 16 - (n))), \
+                             v128_shl_n_byte(a.val[0], (n)))               \
+            : v256_from_v128(                                              \
+                  (n) > 16 ? v128_shl_n_byte(a.val[0], (n)-16) : a.val[0], \
                   v128_zero()))
 
-#define v256_shr_n_byte(a, n)                                                 \
-  ((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.hi, n),                        \
-                             v128_or(v128_shr_n_byte(a.lo, n),                \
-                                     v128_shl_n_byte(a.hi, (16 - (n)) & 31))) \
-            : v256_from_v128(                                                 \
-                  v128_zero(),                                                \
-                  (n) > 16 ? v128_shr_n_byte(a.hi, ((n)-16) & 31) : a.hi))
+#define v256_shr_n_byte(a, n)                                              \
+  ((n) < 16 ? v256_from_v128(v128_shr_n_byte(a.val[1], n),                 \
+                             v128_or(v128_shr_n_byte(a.val[0], n),         \
+                                     v128_shl_n_byte(a.val[1], 16 - (n)))) \
+            : v256_from_v128(                                              \
+                  v128_zero(),                                             \
+                  (n) > 16 ? v128_shr_n_byte(a.val[1], (n)-16) : a.val[1]))
 
 #define v256_align(a, b, c) \
   ((c) ? v256_or(v256_shr_n_byte(b, c), v256_shl_n_byte(a, 32 - (c))) : b)
 
 #define v256_shl_n_8(a, n) \
-  v256_from_v128(v128_shl_n_8(a.hi, n), v128_shl_n_8(a.lo, n))
+  v256_from_v128(v128_shl_n_8(a.val[1], n), v128_shl_n_8(a.val[0], n))
 #define v256_shl_n_16(a, n) \
-  v256_from_v128(v128_shl_n_16(a.hi, n), v128_shl_n_16(a.lo, n))
+  v256_from_v128(v128_shl_n_16(a.val[1], n), v128_shl_n_16(a.val[0], n))
 #define v256_shl_n_32(a, n) \
-  v256_from_v128(v128_shl_n_32(a.hi, n), v128_shl_n_32(a.lo, n))
+  v256_from_v128(v128_shl_n_32(a.val[1], n), v128_shl_n_32(a.val[0], n))
+#define v256_shl_n_64(a, n) \
+  v256_from_v128(v128_shl_n_64(a.val[1], n), v128_shl_n_64(a.val[0], n))
 #define v256_shr_n_u8(a, n) \
-  v256_from_v128(v128_shr_n_u8(a.hi, n), v128_shr_n_u8(a.lo, n))
+  v256_from_v128(v128_shr_n_u8(a.val[1], n), v128_shr_n_u8(a.val[0], n))
 #define v256_shr_n_u16(a, n) \
-  v256_from_v128(v128_shr_n_u16(a.hi, n), v128_shr_n_u16(a.lo, n))
+  v256_from_v128(v128_shr_n_u16(a.val[1], n), v128_shr_n_u16(a.val[0], n))
 #define v256_shr_n_u32(a, n) \
-  v256_from_v128(v128_shr_n_u32(a.hi, n), v128_shr_n_u32(a.lo, n))
+  v256_from_v128(v128_shr_n_u32(a.val[1], n), v128_shr_n_u32(a.val[0], n))
+#define v256_shr_n_u64(a, n) \
+  v256_from_v128(v128_shr_n_u64(a.val[1], n), v128_shr_n_u64(a.val[0], n))
 #define v256_shr_n_s8(a, n) \
-  v256_from_v128(v128_shr_n_s8(a.hi, n), v128_shr_n_s8(a.lo, n))
+  v256_from_v128(v128_shr_n_s8(a.val[1], n), v128_shr_n_s8(a.val[0], n))
 #define v256_shr_n_s16(a, n) \
-  v256_from_v128(v128_shr_n_s16(a.hi, n), v128_shr_n_s16(a.lo, n))
+  v256_from_v128(v128_shr_n_s16(a.val[1], n), v128_shr_n_s16(a.val[0], n))
 #define v256_shr_n_s32(a, n) \
-  v256_from_v128(v128_shr_n_s32(a.hi, n), v128_shr_n_s32(a.lo, n))
+  v256_from_v128(v128_shr_n_s32(a.val[1], n), v128_shr_n_s32(a.val[0], n))
+#define v256_shr_n_s64(a, n) \
+  v256_from_v128(v128_shr_n_s64(a.val[1], n), v128_shr_n_s64(a.val[0], n))
+
+#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
+#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))
+
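
A usage note for the _n_ macros above: they expand to the v128 immediate
forms, so the shift count has to be a compile-time constant; a run-time count
must go through the function-style shifts instead.  A minimal sketch
(hypothetical helper names):

SIMD_INLINE v256 v256_halve_each_u64(v256 x) {
  return v256_shr_n_u64(x, 1); /* literal count: expands to the v128 macros */
}
SIMD_INLINE v256 v256_shr_each_u64(v256 x, unsigned int bits) {
  return v256_shr_u64(x, bits); /* variable count: use the function form */
}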
+typedef struct {
+  sad128_internal_u16 val[2];
+} sad256_internal_u16;
+
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() {
+  sad256_internal_u16 t;
+  t.val[1] = v128_sad_u16_init();
+  t.val[0] = v128_sad_u16_init();
+  return t;
+}
+
+/* Implementation dependent return value.  Result must be finalised with
+   v256_sad_u16_sum().
+   The result for more than 16 v256_sad_u16() calls is undefined. */
+SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
+                                             v256 b) {
+  sad256_internal_u16 t;
+  t.val[1] = v128_sad_u16(s.val[1], a.val[1], b.val[1]);
+  t.val[0] = v128_sad_u16(s.val[0], a.val[0], b.val[0]);
+  return t;
+}
+
+SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
+  return v128_sad_u16_sum(s.val[1]) + v128_sad_u16_sum(s.val[0]);
+}
+
+typedef struct {
+  ssd128_internal_s16 val[2];
+} ssd256_internal_s16;
+
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() {
+  ssd256_internal_s16 t;
+  t.val[1] = v128_ssd_s16_init();
+  t.val[0] = v128_ssd_s16_init();
+  return t;
+}
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v256_ssd_s16_sum(). */
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
+                                             v256 b) {
+  ssd256_internal_s16 t;
+  t.val[1] = v128_ssd_s16(s.val[1], a.val[1], b.val[1]);
+  t.val[0] = v128_ssd_s16(s.val[0], a.val[0], b.val[0]);
+  return t;
+}
+
+SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
+  return v128_ssd_s16_sum(s.val[1]) + v128_ssd_s16_sum(s.val[0]);
+}
 
 #endif /* _V256_INTRINSICS_V128_H */
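
On the generic (non-NEON) path, v256_shuffle_8() above now builds the 32-byte
table lookup from two 16-byte shuffles joined with v128_blend_8(), selected by
whether each pattern byte is below 16.  Its little-endian behaviour reduces to
the following standalone scalar model (mirroring the C reference earlier in
this change):

#include <stdint.h>

/* r[i] = x[pattern[i] & 31]: indices 0..15 read the low half of x,
   16..31 the high half. */
void shuffle_8_model(uint8_t r[32], const uint8_t x[32],
                     const uint8_t pattern[32]) {
  for (int i = 0; i < 32; i++) r[i] = x[pattern[i] & 31];
}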
diff --git a/aom_dsp/simd/v256_intrinsics_x86.h b/aom_dsp/simd/v256_intrinsics_x86.h
index 970f174..6bfc0b0 100644
--- a/aom_dsp/simd/v256_intrinsics_x86.h
+++ b/aom_dsp/simd/v256_intrinsics_x86.h
@@ -39,9 +39,9 @@
   return _mm_unpacklo_epi64(_mm256_extracti128_si256(a, 0), v64_zero());
 }
 
-SIMD_INLINE v128 v256_low_v128(v256 a) {
-  return _mm256_extracti128_si256(a, 0);
-}
+SIMD_INLINE uint64_t v256_low_u64(v256 a) { return v64_u64(v256_low_v64(a)); }
+
+SIMD_INLINE v128 v256_low_v128(v256 a) { return _mm256_castsi256_si128(a); }
 
 SIMD_INLINE v128 v256_high_v128(v256 a) {
   return _mm256_extracti128_si256(a, 1);
@@ -49,8 +49,7 @@
 
 SIMD_INLINE v256 v256_from_v128(v128 a, v128 b) {
   // gcc seems to be missing _mm256_set_m128i()
-  return _mm256_insertf128_si256(
-      _mm256_insertf128_si256(_mm256_setzero_si256(), b, 0), a, 1);
+  return _mm256_inserti128_si256(_mm256_castsi128_si256(b), a, 1);
 }
 
 SIMD_INLINE v256 v256_from_v64(v64 a, v64 b, v64 c, v64 d) {
@@ -85,16 +84,28 @@
 
 SIMD_INLINE v256 v256_dup_32(uint32_t x) { return _mm256_set1_epi32(x); }
 
+SIMD_INLINE v256 v256_dup_64(uint64_t x) { return _mm256_set1_epi64x(x); }
+
 SIMD_INLINE v256 v256_add_8(v256 a, v256 b) { return _mm256_add_epi8(a, b); }
 
 SIMD_INLINE v256 v256_add_16(v256 a, v256 b) { return _mm256_add_epi16(a, b); }
 
+SIMD_INLINE v256 v256_sadd_u8(v256 a, v256 b) { return _mm256_adds_epu8(a, b); }
+
+SIMD_INLINE v256 v256_sadd_s8(v256 a, v256 b) { return _mm256_adds_epi8(a, b); }
+
 SIMD_INLINE v256 v256_sadd_s16(v256 a, v256 b) {
   return _mm256_adds_epi16(a, b);
 }
 
 SIMD_INLINE v256 v256_add_32(v256 a, v256 b) { return _mm256_add_epi32(a, b); }
 
+SIMD_INLINE v256 v256_add_64(v256 a, v256 b) { return _mm256_add_epi64(a, b); }
+
+SIMD_INLINE v256 v256_padd_u8(v256 a) {
+  return _mm256_maddubs_epi16(a, _mm256_set1_epi8(1));
+}
+
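
v256_padd_u8() above leans on _mm256_maddubs_epi16(): each 16-bit output is
u8*1 + u8*1, and the maximum 255 + 255 = 510 is far below the signed 16-bit
saturation bound, so the saturating multiply-add reduces to an exact pairwise
sum.  Scalar model of one output lane:

#include <stdint.h>

/* One 16-bit lane of padd_u8: the sum never exceeds 510, so no saturation
   can occur. */
uint16_t padd_u8_pair(uint8_t x, uint8_t y) {
  return (uint16_t)((uint16_t)x + (uint16_t)y);
}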
 SIMD_INLINE v256 v256_padd_s16(v256 a) {
   return _mm256_madd_epi16(a, _mm256_set1_epi16(1));
 }
@@ -117,6 +128,8 @@
 
 SIMD_INLINE v256 v256_sub_32(v256 a, v256 b) { return _mm256_sub_epi32(a, b); }
 
+SIMD_INLINE v256 v256_sub_64(v256 a, v256 b) { return _mm256_sub_epi64(a, b); }
+
 SIMD_INLINE v256 v256_abs_s16(v256 a) { return _mm256_abs_epi16(a); }
 
 SIMD_INLINE v256 v256_abs_s8(v256 a) { return _mm256_abs_epi8(a); }
@@ -126,43 +139,51 @@
 // unpack/pack intrinsics operate on the 256 bit input vector as 2
 // independent 128 bit vectors.
 SIMD_INLINE v256 v256_ziplo_8(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_8(v256_low_v128(a), v256_low_v128(b)),
-                        v128_ziplo_8(v256_low_v128(a), v256_low_v128(b)));
+  return _mm256_unpacklo_epi8(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziphi_8(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_8(v256_high_v128(a), v256_high_v128(b)),
-                        v128_ziplo_8(v256_high_v128(a), v256_high_v128(b)));
+  return _mm256_unpackhi_epi8(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziplo_16(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_16(v256_low_v128(a), v256_low_v128(b)),
-                        v128_ziplo_16(v256_low_v128(a), v256_low_v128(b)));
+  return _mm256_unpacklo_epi16(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziphi_16(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_16(v256_high_v128(a), v256_high_v128(b)),
-                        v128_ziplo_16(v256_high_v128(a), v256_high_v128(b)));
+  return _mm256_unpackhi_epi16(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziplo_32(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_32(v256_low_v128(a), v256_low_v128(b)),
-                        v128_ziplo_32(v256_low_v128(a), v256_low_v128(b)));
+  return _mm256_unpacklo_epi32(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziphi_32(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_32(v256_high_v128(a), v256_high_v128(b)),
-                        v128_ziplo_32(v256_high_v128(a), v256_high_v128(b)));
+  return _mm256_unpackhi_epi32(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziplo_64(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_64(v256_low_v128(a), v256_low_v128(b)),
-                        v128_ziplo_64(v256_low_v128(a), v256_low_v128(b)));
+  return _mm256_unpacklo_epi64(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziphi_64(v256 a, v256 b) {
-  return v256_from_v128(v128_ziphi_64(v256_high_v128(a), v256_high_v128(b)),
-                        v128_ziplo_64(v256_high_v128(a), v256_high_v128(b)));
+  return _mm256_unpackhi_epi64(
+      _mm256_permute4x64_epi64(b, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)));
 }
 
 SIMD_INLINE v256 v256_ziplo_128(v256 a, v256 b) {
@@ -185,34 +206,54 @@
   return v256_from_v128(v128_ziphi_32(a, b), v128_ziplo_32(a, b));
 }
 
-SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
-  return v256_from_v128(v128_unziplo_8(v256_high_v128(a), v256_low_v128(a)),
-                        v128_unziplo_8(v256_high_v128(b), v256_low_v128(b)));
-}
-
 SIMD_INLINE v256 v256_unziphi_8(v256 a, v256 b) {
-  return v256_from_v128(v128_unziphi_8(v256_high_v128(a), v256_low_v128(a)),
-                        v128_unziphi_8(v256_high_v128(b), v256_low_v128(b)));
+  return _mm256_permute4x64_epi64(
+      _mm256_packs_epi16(_mm256_srai_epi16(b, 8), _mm256_srai_epi16(a, 8)),
+      _MM_SHUFFLE(3, 1, 2, 0));
 }
 
-SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
-  return v256_from_v128(v128_unziplo_16(v256_high_v128(a), v256_low_v128(a)),
-                        v128_unziplo_16(v256_high_v128(b), v256_low_v128(b)));
+SIMD_INLINE v256 v256_unziplo_8(v256 a, v256 b) {
+  return v256_unziphi_8(_mm256_slli_si256(a, 1), _mm256_slli_si256(b, 1));
 }
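v256_unziphi_8 deinterleaves the odd bytes: the arithmetic shift right by 8 leaves the sign-extended odd byte of every 16-bit pair, a value already in [-128, 127], so the signed pack that follows is a pure gather with no saturation. v256_unziplo_8 reuses it after a one-byte left shift per lane, which moves the even bytes onto odd positions. A scalar sketch of the result layout:

    #include <stdint.h>

    /* Sketch of what v256_unziphi_8(a, b) produces: odd bytes of b in the
       low 128 bits, odd bytes of a in the high 128 bits. */
    static void unziphi_8_model(uint8_t dst[32], const uint8_t a[32],
                                const uint8_t b[32]) {
      for (int i = 0; i < 16; i++) {
        dst[i] = b[2 * i + 1];      /* odd bytes of b in the low half */
        dst[16 + i] = a[2 * i + 1]; /* odd bytes of a in the high half */
      }
    }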
 
 SIMD_INLINE v256 v256_unziphi_16(v256 a, v256 b) {
-  return v256_from_v128(v128_unziphi_16(v256_high_v128(a), v256_low_v128(a)),
-                        v128_unziphi_16(v256_high_v128(b), v256_low_v128(b)));
+  return _mm256_permute4x64_epi64(
+      _mm256_packs_epi32(_mm256_srai_epi32(b, 16), _mm256_srai_epi32(a, 16)),
+      _MM_SHUFFLE(3, 1, 2, 0));
 }
 
-SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
-  return v256_from_v128(v128_unziplo_32(v256_high_v128(a), v256_low_v128(a)),
-                        v128_unziplo_32(v256_high_v128(b), v256_low_v128(b)));
+SIMD_INLINE v256 v256_unziplo_16(v256 a, v256 b) {
+  return v256_unziphi_16(_mm256_slli_si256(a, 2), _mm256_slli_si256(b, 2));
 }
 
 SIMD_INLINE v256 v256_unziphi_32(v256 a, v256 b) {
-  return v256_from_v128(v128_unziphi_32(v256_high_v128(a), v256_low_v128(a)),
-                        v128_unziphi_32(v256_high_v128(b), v256_low_v128(b)));
+  return _mm256_permute4x64_epi64(
+      _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b),
+                                            _mm256_castsi256_ps(a),
+                                            _MM_SHUFFLE(3, 1, 3, 1))),
+      _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_unziplo_32(v256 a, v256 b) {
+  return _mm256_permute4x64_epi64(
+      _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(b),
+                                            _mm256_castsi256_ps(a),
+                                            _MM_SHUFFLE(2, 0, 2, 0))),
+      _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_unziphi_64(v256 a, v256 b) {
+  return _mm256_permute4x64_epi64(
+      _mm256_castpd_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(b),
+                                            _mm256_castsi256_pd(a), 15)),
+      _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_unziplo_64(v256 a, v256 b) {
+  return _mm256_permute4x64_epi64(
+      _mm256_castpd_si256(
+          _mm256_shuffle_pd(_mm256_castsi256_pd(b), _mm256_castsi256_pd(a), 0)),
+      _MM_SHUFFLE(3, 1, 2, 0));
 }
 
 SIMD_INLINE v256 v256_unpack_u8_s16(v128 a) {
@@ -220,13 +261,15 @@
 }
 
 SIMD_INLINE v256 v256_unpacklo_u8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_u8_s16(v256_low_v128(a)),
-                        v128_unpacklo_u8_s16(v256_low_v128(a)));
+  return _mm256_unpacklo_epi8(
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_setzero_si256());
 }
 
 SIMD_INLINE v256 v256_unpackhi_u8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_u8_s16(v256_high_v128(a)),
-                        v128_unpacklo_u8_s16(v256_high_v128(a)));
+  return _mm256_unpackhi_epi8(
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_setzero_si256());
 }
 
 SIMD_INLINE v256 v256_unpack_s8_s16(v128 a) {
@@ -234,28 +277,37 @@
 }
 
 SIMD_INLINE v256 v256_unpacklo_s8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_s8_s16(v256_low_v128(a)),
-                        v128_unpacklo_s8_s16(v256_low_v128(a)));
+  return _mm256_srai_epi16(
+      _mm256_unpacklo_epi8(
+          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
+      8);
 }
 
 SIMD_INLINE v256 v256_unpackhi_s8_s16(v256 a) {
-  return v256_from_v128(v128_unpackhi_s8_s16(v256_high_v128(a)),
-                        v128_unpacklo_s8_s16(v256_high_v128(a)));
+  return _mm256_srai_epi16(
+      _mm256_unpackhi_epi8(
+          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
+      8);
 }
 
 SIMD_INLINE v256 v256_pack_s32_s16(v256 a, v256 b) {
-  return v256_from_v128(v128_pack_s32_s16(v256_high_v128(a), v256_low_v128(a)),
-                        v128_pack_s32_s16(v256_high_v128(b), v256_low_v128(b)));
+  return _mm256_permute4x64_epi64(_mm256_packs_epi32(b, a),
+                                  _MM_SHUFFLE(3, 1, 2, 0));
+}
+
+SIMD_INLINE v256 v256_pack_s32_u16(v256 a, v256 b) {
+  return _mm256_permute4x64_epi64(_mm256_packus_epi32(b, a),
+                                  _MM_SHUFFLE(3, 1, 2, 0));
 }
 
 SIMD_INLINE v256 v256_pack_s16_u8(v256 a, v256 b) {
-  return v256_from_v128(v128_pack_s16_u8(v256_high_v128(a), v256_low_v128(a)),
-                        v128_pack_s16_u8(v256_high_v128(b), v256_low_v128(b)));
+  return _mm256_permute4x64_epi64(_mm256_packus_epi16(b, a),
+                                  _MM_SHUFFLE(3, 1, 2, 0));
 }
 
 SIMD_INLINE v256 v256_pack_s16_s8(v256 a, v256 b) {
-  return v256_from_v128(v128_pack_s16_s8(v256_high_v128(a), v256_low_v128(a)),
-                        v128_pack_s16_s8(v256_high_v128(b), v256_low_v128(b)));
+  return _mm256_permute4x64_epi64(_mm256_packs_epi16(b, a),
+                                  _MM_SHUFFLE(3, 1, 2, 0));
 }
 
 SIMD_INLINE v256 v256_unpack_u16_s32(v128 a) {
@@ -267,43 +319,73 @@
 }
 
 SIMD_INLINE v256 v256_unpacklo_u16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_u16_s32(v256_low_v128(a)),
-                        v128_unpacklo_u16_s32(v256_low_v128(a)));
+  return _mm256_unpacklo_epi16(
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_setzero_si256());
 }
 
 SIMD_INLINE v256 v256_unpacklo_s16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_s16_s32(v256_low_v128(a)),
-                        v128_unpacklo_s16_s32(v256_low_v128(a)));
+  return _mm256_srai_epi32(
+      _mm256_unpacklo_epi16(
+          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
+      16);
 }
 
 SIMD_INLINE v256 v256_unpackhi_u16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_u16_s32(v256_high_v128(a)),
-                        v128_unpacklo_u16_s32(v256_high_v128(a)));
+  return _mm256_unpackhi_epi16(
+      _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0)),
+      _mm256_setzero_si256());
 }
 
 SIMD_INLINE v256 v256_unpackhi_s16_s32(v256 a) {
-  return v256_from_v128(v128_unpackhi_s16_s32(v256_high_v128(a)),
-                        v128_unpacklo_s16_s32(v256_high_v128(a)));
+  return _mm256_srai_epi32(
+      _mm256_unpackhi_epi16(
+          a, _mm256_permute4x64_epi64(a, _MM_SHUFFLE(3, 1, 2, 0))),
+      16);
 }
+
 SIMD_INLINE v256 v256_shuffle_8(v256 a, v256 pattern) {
-  v128 c16 = v128_dup_8(16);
-  v128 hi = v256_high_v128(pattern);
-  v128 lo = v256_low_v128(pattern);
-  v128 maskhi = v128_cmplt_s8(hi, c16);
-  v128 masklo = v128_cmplt_s8(lo, c16);
-  return v256_from_v128(
-      v128_or(v128_and(v128_shuffle_8(v256_low_v128(a), hi), maskhi),
-              v128_andn(v128_shuffle_8(v256_high_v128(a), v128_sub_8(hi, c16)),
-                        maskhi)),
-      v128_or(v128_and(v128_shuffle_8(v256_low_v128(a), lo), masklo),
-              v128_andn(v128_shuffle_8(v256_high_v128(a), v128_sub_8(lo, c16)),
-                        masklo)));
+  return _mm256_blendv_epi8(
+      _mm256_shuffle_epi8(
+          _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 1, 0, 1)), pattern),
+      _mm256_shuffle_epi8(
+          _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0, 0, 0, 0)), pattern),
+      _mm256_cmpgt_epi8(v256_dup_8(16), pattern));
+}
+
+SIMD_INLINE v256 v256_wideshuffle_8(v256 a, v256 b, v256 pattern) {
+  v256 c32 = v256_dup_8(32);
+  v256 p32 = v256_sub_8(pattern, c32);
+  v256 r1 = _mm256_blendv_epi8(
+      _mm256_shuffle_epi8(
+          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 1, 0, 1)), p32),
+      _mm256_shuffle_epi8(
+          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 0, 0, 0)), p32),
+      _mm256_cmpgt_epi8(v256_dup_8(48), pattern));
+  v256 r2 = _mm256_blendv_epi8(
+      _mm256_shuffle_epi8(
+          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 3, 0, 3)), pattern),
+      _mm256_shuffle_epi8(
+          _mm256_permute2x128_si256(a, b, _MM_SHUFFLE(0, 2, 0, 2)), pattern),
+      _mm256_cmpgt_epi8(v256_dup_8(16), pattern));
+  return _mm256_blendv_epi8(r1, r2, _mm256_cmpgt_epi8(c32, pattern));
 }
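_mm256_shuffle_epi8 only indexes within its own 128-bit lane, so the full 32-byte lookup is built by shuffling broadcast copies of each half and blending on the index: _mm256_cmpgt_epi8(v256_dup_8(16), pattern) is true for indices 0-15 and selects the low-half candidate. v256_wideshuffle_8 extends the same idea to a 64-byte table with further thresholds at 32 and 48. A scalar sketch of the selection, assuming the library's reference convention that result byte i is source byte (pattern[i] & 31):

    #include <stdint.h>

    /* Sketch of the threshold-based blend in v256_shuffle_8.  Each
       candidate stands for one _mm256_shuffle_epi8 of a broadcast
       128-bit half; the index comparison plays the role of the blend
       mask. */
    static void shuffle32_model(uint8_t dst[32], const uint8_t src[32],
                                const uint8_t pattern[32]) {
      for (int i = 0; i < 32; i++) {
        uint8_t idx = pattern[i] & 31;
        uint8_t from_low = src[idx & 15];         /* low 128-bit half */
        uint8_t from_high = src[16 + (idx & 15)]; /* high 128-bit half */
        dst[i] = idx < 16 ? from_low : from_high;
      }
    }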
 
 SIMD_INLINE v256 v256_pshuffle_8(v256 a, v256 pattern) {
   return _mm256_shuffle_epi8(a, pattern);
 }
 
+SIMD_INLINE int64_t v256_dotp_su8(v256 a, v256 b) {
+  v256 t1 = _mm256_madd_epi16(v256_unpackhi_s8_s16(a), v256_unpackhi_u8_s16(b));
+  v256 t2 = _mm256_madd_epi16(v256_unpacklo_s8_s16(a), v256_unpacklo_u8_s16(b));
+  t1 = _mm256_add_epi32(t1, t2);
+  v128 t = _mm_add_epi32(_mm256_extracti128_si256(t1, 0),
+                         _mm256_extracti128_si256(t1, 1));
+  t = _mm_add_epi32(t, _mm_srli_si128(t, 8));
+  t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
+  return (int32_t)v128_low_u32(t);
+}
+
 SIMD_INLINE int64_t v256_dotp_s16(v256 a, v256 b) {
   v256 r = _mm256_madd_epi16(a, b);
 #if defined(__x86_64__)
@@ -327,6 +409,29 @@
 #endif
 }
 
+SIMD_INLINE int64_t v256_dotp_s32(v256 a, v256 b) {
+  v256 r = _mm256_mullo_epi32(a, b);
+#if defined(__x86_64__)
+  v128 t;
+  r = _mm256_add_epi64(_mm256_cvtepi32_epi64(v256_high_v128(r)),
+                       _mm256_cvtepi32_epi64(v256_low_v128(r)));
+  t = v256_low_v128(_mm256_add_epi64(
+      r, _mm256_permute2x128_si256(r, r, _MM_SHUFFLE(2, 0, 0, 1))));
+  return _mm_cvtsi128_si64(_mm_add_epi64(t, _mm_srli_si128(t, 8)));
+#else
+  v128 l = v256_low_v128(r);
+  v128 h = v256_high_v128(r);
+  return (int64_t)_mm_cvtsi128_si32(l) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 4)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 8)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(l, 12)) +
+         (int64_t)_mm_cvtsi128_si32(h) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 4)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 8)) +
+         (int64_t)_mm_cvtsi128_si32(_mm_srli_si128(h, 12));
+#endif
+}
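v256_dotp_s32 multiplies with _mm256_mullo_epi32, so each product is truncated to its low 32 bits before being sign-extended and summed; this is one of the intended two's-complement overflows mentioned in the commit message. A scalar sketch of the same computation (the buffer length is hypothetical):

    #include <stdint.h>

    /* Sketch of v256_dotp_s32: low 32 bits of each product, sign-extended
       and accumulated into a 64-bit sum.  The unsigned multiply avoids
       signed overflow in C while matching the two's-complement wrap of
       _mm256_mullo_epi32. */
    static int64_t dotp_s32_model(const int32_t *a, const int32_t *b, int n) {
      int64_t sum = 0;
      for (int i = 0; i < n; i++) {
        uint32_t p = (uint32_t)a[i] * (uint32_t)b[i]; /* low 32 bits */
        sum += (int32_t)p;                            /* sign-extend */
      }
      return sum;
    }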
+
 SIMD_INLINE uint64_t v256_hadd_u8(v256 a) {
   v256 t = _mm256_sad_epu8(a, _mm256_setzero_si256());
   v128 lo = v256_low_v128(t);
@@ -342,7 +447,7 @@
 }
 
 /* Implementation dependent return value.  Result must be finalised with
-   v256_sad_sum().
+   v256_sad_u8_sum().
    The result for more than 32 v256_sad_u8() calls is undefined. */
 SIMD_INLINE sad256_internal v256_sad_u8(sad256_internal s, v256 a, v256 b) {
   return _mm256_add_epi64(s, _mm256_sad_epu8(a, b));
@@ -360,7 +465,7 @@
 }
 
 /* Implementation dependent return value.  Result must be finalised with
- * v256_ssd_sum(). */
+ * v256_ssd_u8_sum(). */
 SIMD_INLINE ssd256_internal v256_ssd_u8(ssd256_internal s, v256 a, v256 b) {
   v256 l = _mm256_sub_epi16(_mm256_unpacklo_epi8(a, _mm256_setzero_si256()),
                             _mm256_unpacklo_epi8(b, _mm256_setzero_si256()));
@@ -426,6 +531,12 @@
       _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_8(1)));
 }
 
+SIMD_INLINE v256 v256_rdavg_u16(v256 a, v256 b) {
+  return _mm256_sub_epi16(
+      _mm256_avg_epu16(a, b),
+      _mm256_and_si256(_mm256_xor_si256(a, b), v256_dup_16(1)));
+}
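AVX2 only provides a rounding-up average (_mm256_avg_epu16 computes (a + b + 1) >> 1), so the rounding-down average is recovered by subtracting the carry bit (a ^ b) & 1, mirroring the existing v256_rdavg_u8. A scalar sketch of the identity:

    #include <stdint.h>

    /* Sketch: (a + b) >> 1 == ((a + b + 1) >> 1) - ((a ^ b) & 1).
       The correction only applies when a + b is odd, i.e. when the low
       bits of a and b differ. */
    static uint16_t rdavg_u16_model(uint16_t a, uint16_t b) {
      uint16_t round_up = (uint16_t)(((uint32_t)a + b + 1) >> 1); /* avg_epu16 */
      return (uint16_t)(round_up - ((a ^ b) & 1));
    }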
+
 SIMD_INLINE v256 v256_avg_u16(v256 a, v256 b) { return _mm256_avg_epu16(a, b); }
 
 SIMD_INLINE v256 v256_min_u8(v256 a, v256 b) { return _mm256_min_epu8(a, b); }
@@ -434,18 +545,28 @@
 
 SIMD_INLINE v256 v256_min_s8(v256 a, v256 b) { return _mm256_min_epi8(a, b); }
 
+SIMD_INLINE uint32_t v256_movemask_8(v256 a) { return _mm256_movemask_epi8(a); }
+
+SIMD_INLINE v256 v256_blend_8(v256 a, v256 b, v256 c) {
+  return _mm256_blendv_epi8(a, b, c);
+}
+
 SIMD_INLINE v256 v256_max_s8(v256 a, v256 b) { return _mm256_max_epi8(a, b); }
 
 SIMD_INLINE v256 v256_min_s16(v256 a, v256 b) { return _mm256_min_epi16(a, b); }
 
 SIMD_INLINE v256 v256_max_s16(v256 a, v256 b) { return _mm256_max_epi16(a, b); }
 
+SIMD_INLINE v256 v256_min_s32(v256 a, v256 b) { return _mm256_min_epi32(a, b); }
+
+SIMD_INLINE v256 v256_max_s32(v256 a, v256 b) { return _mm256_max_epi32(a, b); }
+
 SIMD_INLINE v256 v256_cmpgt_s8(v256 a, v256 b) {
   return _mm256_cmpgt_epi8(a, b);
 }
 
 SIMD_INLINE v256 v256_cmplt_s8(v256 a, v256 b) {
-  return v256_andn(_mm256_cmpgt_epi8(b, a), _mm256_cmpeq_epi8(b, a));
+  return _mm256_cmpgt_epi8(b, a);
 }
 
 SIMD_INLINE v256 v256_cmpeq_8(v256 a, v256 b) {
@@ -457,13 +578,25 @@
 }
 
 SIMD_INLINE v256 v256_cmplt_s16(v256 a, v256 b) {
-  return v256_andn(_mm256_cmpgt_epi16(b, a), _mm256_cmpeq_epi16(b, a));
+  return _mm256_cmpgt_epi16(b, a);
 }
 
 SIMD_INLINE v256 v256_cmpeq_16(v256 a, v256 b) {
   return _mm256_cmpeq_epi16(a, b);
 }
 
+SIMD_INLINE v256 v256_cmpgt_s32(v256 a, v256 b) {
+  return _mm256_cmpgt_epi32(a, b);
+}
+
+SIMD_INLINE v256 v256_cmplt_s32(v256 a, v256 b) {
+  return _mm256_cmpgt_epi32(b, a);
+}
+
+SIMD_INLINE v256 v256_cmpeq_32(v256 a, v256 b) {
+  return _mm256_cmpeq_epi32(a, b);
+}
+
 SIMD_INLINE v256 v256_shl_8(v256 a, unsigned int c) {
   return _mm256_and_si256(_mm256_set1_epi8((uint8_t)(0xff << c)),
                           _mm256_sll_epi16(a, _mm_cvtsi32_si128(c)));
@@ -504,6 +637,23 @@
   return _mm256_sra_epi32(a, _mm_cvtsi32_si128(c));
 }
 
+SIMD_INLINE v256 v256_shl_64(v256 a, unsigned int c) {
+  return _mm256_sll_epi64(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_u64(v256 a, unsigned int c) {
+  return _mm256_srl_epi64(a, _mm_cvtsi32_si128(c));
+}
+
+SIMD_INLINE v256 v256_shr_s64(v256 a, unsigned int c) {
+#if defined(__AVX512VL__)

+  return _mm256_sra_epi64(a, _mm_cvtsi32_si128(c));
+#else
+  return v256_from_v128(v128_shr_s64(v256_high_v128(a), c),
+                        v128_shr_s64(v256_low_v128(a), c));
+#endif
+}
+
 /* These intrinsics require immediate values, so we must use #defines
    to enforce that. */
 // _mm256_slli_si256 works on 128 bit lanes and can't be used
@@ -542,6 +692,59 @@
 #define v256_shl_n_32(a, c) _mm256_slli_epi32(a, c)
 #define v256_shr_n_u32(a, c) _mm256_srli_epi32(a, c)
 #define v256_shr_n_s32(a, c) _mm256_srai_epi32(a, c)
+#define v256_shl_n_64(a, c) _mm256_slli_epi64(a, c)
+#define v256_shr_n_u64(a, c) _mm256_srli_epi64(a, c)
+#define v256_shr_n_s64(a, c) \
+  v256_shr_s64((a), (c))  // _mm256_srai_epi64 broken in gcc?
+#define v256_shr_n_word(a, n) v256_shr_n_byte(a, 2 * (n))
+#define v256_shl_n_word(a, n) v256_shl_n_byte(a, 2 * (n))
+
+typedef v256 sad256_internal_u16;
+
+SIMD_INLINE sad256_internal_u16 v256_sad_u16_init() { return v256_zero(); }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v256_sad_u16_sum(). */
+SIMD_INLINE sad256_internal_u16 v256_sad_u16(sad256_internal_u16 s, v256 a,
+                                             v256 b) {
+#if defined(__SSE4_1__)
+  v256 t = v256_sub_16(_mm256_max_epu16(a, b), _mm256_min_epu16(a, b));
+#else
+  v256 t = v256_cmplt_s16(v256_xor(a, v256_dup_16(32768)),
+                          v256_xor(b, v256_dup_16(32768)));
+  t = v256_sub_16(v256_or(v256_and(b, t), v256_andn(a, t)),
+                  v256_or(v256_and(a, t), v256_andn(b, t)));
+#endif
+  return v256_add_32(
+      s, v256_add_32(v256_unpackhi_u16_s32(t), v256_unpacklo_u16_s32(t)));
+}
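The non-SSE4.1 path has no unsigned 16-bit min/max, so it biases both operands by 0x8000 and uses the signed comparison to drive the |a - b| selection. A scalar sketch of why the bias works:

    #include <stdint.h>

    /* Sketch: XORing with 0x8000 and comparing as signed 16-bit values is
       the same as comparing a - 32768 and b - 32768, i.e. an unsigned
       comparison expressed with the signed compare SSE2 provides.  The
       comparison mask then selects b - a or a - b. */
    static uint16_t absdiff_u16_model(uint16_t a, uint16_t b) {
      int a_lt_b = ((int32_t)a - 32768) < ((int32_t)b - 32768); /* a < b */
      return a_lt_b ? (uint16_t)(b - a) : (uint16_t)(a - b);
    }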
+
+SIMD_INLINE uint32_t v256_sad_u16_sum(sad256_internal_u16 s) {
+  v128 t = v128_add_32(v256_high_v128(s), v256_low_v128(s));
+  return v128_low_u32(t) + v128_low_u32(v128_shr_n_byte(t, 4)) +
+         v128_low_u32(v128_shr_n_byte(t, 8)) +
+         v128_low_u32(v128_shr_n_byte(t, 12));
+}
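Like the existing u8 helpers, the u16 SAD (and the s16 SSD below) follow an init/accumulate/finalise pattern. A sketch of the intended usage, with hypothetical buffer names and length:

    /* Sketch only: src, ref and n are hypothetical.  Each v256 holds 16
       uint16_t lanes; partial results are accumulated and then reduced
       once at the end. */
    static uint32_t block_sad_u16(const uint16_t *src, const uint16_t *ref,
                                  int n) {
      sad256_internal_u16 s = v256_sad_u16_init();
      for (int i = 0; i + 16 <= n; i += 16)
        s = v256_sad_u16(s, v256_load_unaligned(src + i),
                         v256_load_unaligned(ref + i));
      return v256_sad_u16_sum(s);
    }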
+
+typedef v256 ssd256_internal_s16;
+
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16_init() { return v256_zero(); }
+
+/* Implementation dependent return value.  Result must be finalised with
+ * v256_ssd_s16_sum(). */
+SIMD_INLINE ssd256_internal_s16 v256_ssd_s16(ssd256_internal_s16 s, v256 a,
+                                             v256 b) {
+  v256 d = v256_sub_16(a, b);
+  d = v256_madd_s16(d, d);
+  return v256_add_64(s, v256_add_64(_mm256_unpackhi_epi32(d, v256_zero()),
+                                    _mm256_unpacklo_epi32(d, v256_zero())));
+}
+
+SIMD_INLINE uint64_t v256_ssd_s16_sum(ssd256_internal_s16 s) {
+  v128 t = v128_add_64(v256_high_v128(s), v256_low_v128(s));
+  return v64_u64(v128_low_v64(t)) + v64_u64(v128_high_v64(t));
+}
+
 #endif
 
 #endif /* _V256_INTRINSICS_H */
diff --git a/aom_dsp/simd/v64_intrinsics.h b/aom_dsp/simd/v64_intrinsics.h
index 1d62ae7..6ce53c6 100644
--- a/aom_dsp/simd/v64_intrinsics.h
+++ b/aom_dsp/simd/v64_intrinsics.h
@@ -72,6 +72,8 @@
 
 SIMD_INLINE v64 v64_add_8(v64 a, v64 b) { return c_v64_add_8(a, b); }
 SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return c_v64_add_16(a, b); }
+SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return c_v64_sadd_u8(a, b); }
+SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return c_v64_sadd_s8(a, b); }
 SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return c_v64_sadd_s16(a, b); }
 SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return c_v64_add_32(a, b); }
 SIMD_INLINE v64 v64_sub_8(v64 a, v64 b) { return c_v64_sub_8(a, b); }
@@ -101,6 +103,9 @@
 SIMD_INLINE v64 v64_pack_s32_s16(v64 a, v64 b) {
   return c_v64_pack_s32_s16(a, b);
 }
+SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) {
+  return c_v64_pack_s32_u16(a, b);
+}
 SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
   return c_v64_pack_s16_u8(a, b);
 }
@@ -157,6 +162,7 @@
 
 SIMD_INLINE v64 v64_avg_u8(v64 a, v64 b) { return c_v64_avg_u8(a, b); }
 SIMD_INLINE v64 v64_rdavg_u8(v64 a, v64 b) { return c_v64_rdavg_u8(a, b); }
+SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) { return c_v64_rdavg_u16(a, b); }
 SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return c_v64_avg_u16(a, b); }
 SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return c_v64_min_u8(a, b); }
 SIMD_INLINE v64 v64_max_u8(v64 a, v64 b) { return c_v64_max_u8(a, b); }
diff --git a/aom_dsp/simd/v64_intrinsics_arm.h b/aom_dsp/simd/v64_intrinsics_arm.h
index 180a735..267441b 100644
--- a/aom_dsp/simd/v64_intrinsics_arm.h
+++ b/aom_dsp/simd/v64_intrinsics_arm.h
@@ -122,20 +122,34 @@
 }
 
 SIMD_INLINE int64_t v64_dotp_su8(v64 x, v64 y) {
-  int64x2_t r = vpaddlq_s32(vpaddlq_s16(
+  int16x8_t t =
       vmulq_s16(vmovl_s8(vreinterpret_s8_s64(x)),
-                vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))))));
+                vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(y))));
+#if defined(__aarch64__)
+  return vaddlvq_s16(t);
+#else
+  int64x2_t r = vpaddlq_s32(vpaddlq_s16(t));
   return (int64_t)vadd_s64(vget_high_s64(r), vget_low_s64(r));
+#endif
 }
 
 SIMD_INLINE int64_t v64_dotp_s16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vaddlvq_s32(
+      vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+#else
   int64x2_t r =
       vpaddlq_s32(vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
   return (int64_t)(vget_high_s64(r) + vget_low_s64(r));
+#endif
 }
 
 SIMD_INLINE uint64_t v64_hadd_u8(v64 x) {
+#if defined(__aarch64__)
+  return vaddlv_u8(vreinterpret_u8_s64(x));
+#else
   return (uint64_t)vpaddl_u32(vpaddl_u16(vpaddl_u8(vreinterpret_u8_s64(x))));
+#endif
 }
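On AArch64 the whole widening horizontal add collapses into a single vaddlv/vaddlvq, replacing the chain of pairwise-widening adds kept for the ARMv7 path. A scalar model of what both paths compute:

    #include <stdint.h>

    /* Scalar model of v64_hadd_u8: a widening horizontal add of all eight
       bytes, which vaddlv_u8 performs in one instruction on aarch64. */
    static uint64_t hadd_u8_model(const uint8_t v[8]) {
      uint64_t sum = 0;
      for (int i = 0; i < 8; i++) sum += v[i];
      return sum;
    }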
 
 SIMD_INLINE int64_t v64_hadd_s16(v64 a) {
@@ -146,35 +160,40 @@
 
 SIMD_INLINE sad64_internal v64_sad_u8_init() { return vdupq_n_u16(0); }
 
-/* Implementation dependent return value.  Result must be finalised with
-   v64_sad_u8_sum().
-   The result for more than 32 v64_sad_u8() calls is undefined. */
+// Implementation dependent return value. Result must be finalised with
+// v64_sad_u8_sum().
 SIMD_INLINE sad64_internal v64_sad_u8(sad64_internal s, v64 a, v64 b) {
   return vabal_u8(s, vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
 }
 
 SIMD_INLINE uint32_t v64_sad_u8_sum(sad64_internal s) {
+#if defined(__aarch64__)
+  return vaddlvq_u16(s);
+#else
   uint64x2_t r = vpaddlq_u32(vpaddlq_u16(s));
   return (uint32_t)(uint64_t)(vget_high_u64(r) + vget_low_u64(r));
+#endif
 }
 
-typedef int64x1_t ssd64_internal;
+typedef uint32x4_t ssd64_internal;
 
-SIMD_INLINE ssd64_internal v64_ssd_u8_init() {
-  return (ssd64_internal)(uint64_t)0;
-}
+SIMD_INLINE ssd64_internal v64_ssd_u8_init() { return vdupq_n_u32(0); }
 
-/* Implementation dependent return value.  Result must be finalised with
- * v64_ssd_u8_sum(). */
+// Implementation dependent return value. Result must be finalised with
+// v64_ssd_u8_sum().
 SIMD_INLINE ssd64_internal v64_ssd_u8(ssd64_internal s, v64 a, v64 b) {
   uint8x8_t t = vabd_u8(vreinterpret_u8_s64(a), vreinterpret_u8_s64(b));
-  uint64x2_t r = vpaddlq_u32(vpaddlq_u16(vmull_u8(t, t)));
-  return (ssd64_internal)vadd_u64((uint64x1_t)s,
-                                  vadd_u64(vget_high_u64(r), vget_low_u64(r)));
+  return vaddq_u32(s, vpaddlq_u16(vmull_u8(t, t)));
 }
 
 SIMD_INLINE uint32_t v64_ssd_u8_sum(ssd64_internal s) {
-  return (uint32_t)(uint64_t)s;
+#if defined(__aarch64__)
+  return vaddvq_u32(s);
+#else
+  uint64x2_t t = vpaddlq_u32(s);
+  return vget_lane_u32(
+      vreinterpret_u32_u64(vadd_u64(vget_high_u64(t), vget_low_u64(t))), 0);
+#endif
 }
 
 SIMD_INLINE v64 v64_or(v64 x, v64 y) { return vorr_s64(x, y); }
@@ -190,6 +209,16 @@
       vadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
 }
 
+SIMD_INLINE v64 v64_sadd_u8(v64 x, v64 y) {
+  return vreinterpret_s64_u8(
+      vqadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
+}
+
+SIMD_INLINE v64 v64_sadd_s8(v64 x, v64 y) {
+  return vreinterpret_s64_s8(
+      vqadd_s8(vreinterpret_s8_s64(x), vreinterpret_s8_s64(y)));
+}
+
 SIMD_INLINE v64 v64_add_16(v64 x, v64 y) {
   return vreinterpret_s64_s16(
       vadd_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
@@ -254,8 +283,14 @@
 }
 
 SIMD_INLINE v64 v64_mulhi_s16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  int16x8_t t = vreinterpretq_s16_s32(
+      vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)));
+  return vget_low_s64(vreinterpretq_s64_s16(vuzp2q_s16(t, t)));
+#else
   return vreinterpret_s64_s16(vmovn_s32(vshrq_n_s32(
       vmull_s16(vreinterpret_s16_s64(x), vreinterpret_s16_s64(y)), 16)));
+#endif
 }
 
 SIMD_INLINE v64 v64_mullo_s32(v64 x, v64 y) {
@@ -271,10 +306,10 @@
 }
 
 SIMD_INLINE v64 v64_madd_us8(v64 x, v64 y) {
-  return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(
-      vaddq_s16(vmull_s8(vadd_s8(vreinterpret_s8_s64(x), vdup_n_s8(-128)),
-                         vreinterpret_s8_s64(y)),
-                vshlq_n_s16(vmovl_s8(vreinterpret_s8_s64(y)), 7)))));
+  int16x8_t t =
+      vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vreinterpret_u8_s64(x))),
+                vmovl_s8(vreinterpret_s8_s64(y)));
+  return vreinterpret_s64_s16(vqmovn_s32(vpaddlq_s16(t)));
 }
 
 SIMD_INLINE v64 v64_avg_u8(v64 x, v64 y) {
@@ -287,6 +322,11 @@
       vhadd_u8(vreinterpret_u8_s64(x), vreinterpret_u8_s64(y)));
 }
 
+SIMD_INLINE v64 v64_rdavg_u16(v64 x, v64 y) {
+  return vreinterpret_s64_u16(
+      vhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
+}
+
 SIMD_INLINE v64 v64_avg_u16(v64 x, v64 y) {
   return vreinterpret_s64_u16(
       vrhadd_u16(vreinterpret_u16_s64(x), vreinterpret_u16_s64(y)));
@@ -323,33 +363,63 @@
 }
 
 SIMD_INLINE v64 v64_ziplo_8(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u8(
+      vzip1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
+#else
   uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
   return vreinterpret_s64_u8(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v64 v64_ziphi_8(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u8(
+      vzip2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
+#else
   uint8x8x2_t r = vzip_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
   return vreinterpret_s64_u8(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v64 v64_ziplo_16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u16(
+      vzip1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
+#else
   int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
   return vreinterpret_s64_s16(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v64 v64_ziphi_16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u16(
+      vzip2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
+#else
   int16x4x2_t r = vzip_s16(vreinterpret_s16_s64(y), vreinterpret_s16_s64(x));
   return vreinterpret_s64_s16(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v64 v64_ziplo_32(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u32(
+      vzip1_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)));
+#else
   int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
   return vreinterpret_s64_s32(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v64 v64_ziphi_32(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u32(
+      vzip2_u32(vreinterpret_u32_s64(y), vreinterpret_u32_s64(x)));
+#else
   int32x2x2_t r = vzip_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x));
   return vreinterpret_s64_s32(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v64 v64_unpacklo_u8_s16(v64 a) {
@@ -373,6 +443,11 @@
       vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
 }
 
+SIMD_INLINE v64 v64_pack_s32_u16(v64 x, v64 y) {
+  return vreinterpret_s64_u16(vqmovun_s32(
+      vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x))));
+}
+
 SIMD_INLINE v64 v64_pack_s16_u8(v64 x, v64 y) {
   return vreinterpret_s64_u8(vqmovun_s16(vreinterpretq_s16_s32(
       vcombine_s32(vreinterpret_s32_s64(y), vreinterpret_s32_s64(x)))));
@@ -384,23 +459,43 @@
 }
 
 SIMD_INLINE v64 v64_unziplo_8(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u8(
+      vuzp1_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
+#else
   uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
   return vreinterpret_s64_u8(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v64 v64_unziphi_8(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u8(
+      vuzp2_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x)));
+#else
   uint8x8x2_t r = vuzp_u8(vreinterpret_u8_s64(y), vreinterpret_u8_s64(x));
   return vreinterpret_s64_u8(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v64 v64_unziplo_16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u16(
+      vuzp1_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
+#else
   uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
   return vreinterpret_s64_u16(r.val[0]);
+#endif
 }
 
 SIMD_INLINE v64 v64_unziphi_16(v64 x, v64 y) {
+#if defined(__aarch64__)
+  return vreinterpret_s64_u16(
+      vuzp2_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x)));
+#else
   uint16x4x2_t r = vuzp_u16(vreinterpret_u16_s64(y), vreinterpret_u16_s64(x));
   return vreinterpret_s64_u16(r.val[1]);
+#endif
 }
 
 SIMD_INLINE v64 v64_unpacklo_s16_s32(v64 x) {
diff --git a/aom_dsp/simd/v64_intrinsics_c.h b/aom_dsp/simd/v64_intrinsics_c.h
index b5f8a83..8158899 100644
--- a/aom_dsp/simd/v64_intrinsics_c.h
+++ b/aom_dsp/simd/v64_intrinsics_c.h
@@ -31,13 +31,17 @@
   int64_t s64;
 } c_v64;
 
-SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) { return a.u32[CONFIG_BIG_ENDIAN]; }
+SIMD_INLINE uint32_t c_v64_low_u32(c_v64 a) {
+  return a.u32[!!CONFIG_BIG_ENDIAN];
+}
 
 SIMD_INLINE uint32_t c_v64_high_u32(c_v64 a) {
   return a.u32[!CONFIG_BIG_ENDIAN];
 }
 
-SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) { return a.s32[CONFIG_BIG_ENDIAN]; }
+SIMD_INLINE int32_t c_v64_low_s32(c_v64 a) {
+  return a.s32[!!CONFIG_BIG_ENDIAN];
+}
 
 SIMD_INLINE int32_t c_v64_high_s32(c_v64 a) {
   return a.s32[!CONFIG_BIG_ENDIAN];
@@ -46,7 +50,7 @@
 SIMD_INLINE c_v64 c_v64_from_32(uint32_t x, uint32_t y) {
   c_v64 t;
   t.u32[!CONFIG_BIG_ENDIAN] = x;
-  t.u32[CONFIG_BIG_ENDIAN] = y;
+  t.u32[!!CONFIG_BIG_ENDIAN] = y;
   return t;
 }
 
@@ -178,6 +182,30 @@
   return t;
 }
 
+SIMD_INLINE c_v64 c_v64_sadd_u8(c_v64 a, c_v64 b) {
+  c_v64 t;
+  int c;
+  for (c = 0; c < 8; c++)
+    t.u8[c] = (int16_t)a.u8[c] + (int16_t)b.u8[c] > 255
+                  ? 255
+                  : (int16_t)a.u8[c] + (int16_t)b.u8[c] < 0
+                        ? 0
+                        : (int16_t)a.u8[c] + (int16_t)b.u8[c];
+  return t;
+}
+
+SIMD_INLINE c_v64 c_v64_sadd_s8(c_v64 a, c_v64 b) {
+  c_v64 t;
+  int c;
+  for (c = 0; c < 8; c++)
+    t.s8[c] = (int16_t)a.s8[c] + (int16_t)b.s8[c] > 127
+                  ? 127
+                  : (int16_t)a.s8[c] + (int16_t)b.s8[c] < -128
+                        ? -128
+                        : (int16_t)a.s8[c] + (int16_t)b.s8[c];
+  return t;
+}
+
 SIMD_INLINE c_v64 c_v64_sadd_s16(c_v64 a, c_v64 b) {
   c_v64 t;
   int c;
@@ -207,8 +235,7 @@
 SIMD_INLINE c_v64 c_v64_ssub_u8(c_v64 a, c_v64 b) {
   c_v64 t;
   int c;
-  for (c = 0; c < 8; c++)
-    t.u8[c] = (int32_t)a.u8[c] - (int32_t)b.u8[c] < 0 ? 0 : a.u8[c] - b.u8[c];
+  for (c = 0; c < 8; c++) t.u8[c] = a.u8[c] < b.u8[c] ? 0 : a.u8[c] - b.u8[c];
   return t;
 }
 
@@ -460,6 +487,20 @@
   return t;
 }
 
+SIMD_INLINE c_v64 c_v64_pack_s32_u16(c_v64 a, c_v64 b) {
+  c_v64 t;
+  if (CONFIG_BIG_ENDIAN) {
+    c_v64 u = a;
+    a = b;
+    b = u;
+  }
+  t.u16[3] = a.s32[1] > 65535 ? 65535 : a.s32[1] < 0 ? 0 : a.s32[1];
+  t.u16[2] = a.s32[0] > 65535 ? 65535 : a.s32[0] < 0 ? 0 : a.s32[0];
+  t.u16[1] = b.s32[1] > 65535 ? 65535 : b.s32[1] < 0 ? 0 : b.s32[1];
+  t.u16[0] = b.s32[0] > 65535 ? 65535 : b.s32[0] < 0 ? 0 : b.s32[0];
+  return t;
+}
+
 SIMD_INLINE c_v64 c_v64_pack_s16_u8(c_v64 a, c_v64 b) {
   c_v64 t;
   if (CONFIG_BIG_ENDIAN) {
@@ -671,6 +712,13 @@
   return t;
 }
 
+SIMD_INLINE c_v64 c_v64_rdavg_u16(c_v64 a, c_v64 b) {
+  c_v64 t;
+  int c;
+  for (c = 0; c < 4; c++) t.u16[c] = (a.u16[c] + b.u16[c]) >> 1;
+  return t;
+}
+
 SIMD_INLINE c_v64 c_v64_avg_u16(c_v64 a, c_v64 b) {
   c_v64 t;
   int c;
diff --git a/aom_dsp/simd/v64_intrinsics_x86.h b/aom_dsp/simd/v64_intrinsics_x86.h
index 8dcc9f6..130052e 100644
--- a/aom_dsp/simd/v64_intrinsics_x86.h
+++ b/aom_dsp/simd/v64_intrinsics_x86.h
@@ -90,8 +90,7 @@
   _mm_storel_epi64((__m128i *)p, a);
 }
 
-// The following function requires an immediate.
-#if defined(__OPTIMIZE__) && __OPTIMIZE__
+#if defined(__OPTIMIZE__) && __OPTIMIZE__ && !defined(__clang__)
 #define v64_align(a, b, c) \
   ((c) ? _mm_srli_si128(_mm_unpacklo_epi64(b, a), (c)) : b)
 #else
@@ -112,6 +111,10 @@
 
 SIMD_INLINE v64 v64_add_16(v64 a, v64 b) { return _mm_add_epi16(a, b); }
 
+SIMD_INLINE v64 v64_sadd_u8(v64 a, v64 b) { return _mm_adds_epu8(a, b); }
+
+SIMD_INLINE v64 v64_sadd_s8(v64 a, v64 b) { return _mm_adds_epi8(a, b); }
+
 SIMD_INLINE v64 v64_sadd_s16(v64 a, v64 b) { return _mm_adds_epi16(a, b); }
 
 SIMD_INLINE v64 v64_add_32(v64 a, v64 b) { return _mm_add_epi32(a, b); }
@@ -170,6 +173,22 @@
   return _mm_packs_epi32(t, t);
 }
 
+SIMD_INLINE v64 v64_pack_s32_u16(v64 a, v64 b) {
+#if defined(__SSE4_1__)
+  __m128i t = _mm_unpacklo_epi64(b, a);
+  return _mm_packus_epi32(t, t);
+#else
+  int32_t ah = v64_high_u32(a);
+  int32_t al = v64_low_u32(a);
+  int32_t bh = v64_high_u32(b);
+  int32_t bl = v64_low_u32(b);
+  return v64_from_16(ah > 65535 ? 65535 : ah < 0 ? 0 : ah,
+                     al > 65535 ? 65535 : al < 0 ? 0 : al,
+                     bh > 65535 ? 65535 : bh < 0 ? 0 : bh,
+                     bl > 65535 ? 65535 : bl < 0 ? 0 : bl);
+#endif
+}
+
 SIMD_INLINE v64 v64_pack_s16_u8(v64 a, v64 b) {
   __m128i t = _mm_unpacklo_epi64(b, a);
   return _mm_packus_epi16(t, t);
@@ -272,14 +291,11 @@
 }
 
 SIMD_INLINE int64_t v64_dotp_su8(v64 a, v64 b) {
-  __m128i r, r1, r2, z;
-  z = _mm_setzero_si128();
-  r1 = _mm_madd_epi16(_mm_slli_epi16(_mm_unpacklo_epi8(a, z), 8),
-                      _mm_unpacklo_epi8(b, z));
-  r2 = _mm_srli_si128(r1, 8);
-  r = _mm_add_epi32(r1, r2);
-  r = _mm_add_epi32(r, _mm_srli_si128(r, 4));
-  return ((int32_t)v64_low_u32(r)) >> 8;
+  __m128i t = _mm_madd_epi16(_mm_srai_epi16(_mm_unpacklo_epi8(a, a), 8),
+                             _mm_unpacklo_epi8(b, _mm_setzero_si128()));
+  t = _mm_add_epi32(t, _mm_srli_si128(t, 8));
+  t = _mm_add_epi32(t, _mm_srli_si128(t, 4));
+  return (int32_t)v64_low_u32(t);
 }
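The rewritten v64_dotp_su8 sign-extends the bytes of a by unpacking a with itself, which places each byte in the upper half of a 16-bit lane, and then shifting right arithmetically by 8; this avoids the shift-left and final correction used by the old version. A scalar sketch of the trick:

    #include <stdint.h>
    #include <string.h>

    /* Sketch: duplicating a byte into both halves of a 16-bit lane
       (unpacklo_epi8(a, a)) and shifting right arithmetically by 8 yields
       the sign-extended byte.  Relies on the two's-complement arithmetic
       shift the vector instruction guarantees. */
    static int16_t sign_extend_s8_model(uint8_t b) {
      uint16_t lane = (uint16_t)((b << 8) | b); /* byte in both halves */
      int16_t s;
      memcpy(&s, &lane, sizeof(s)); /* reinterpret, as the SIMD lane is */
      return (int16_t)(s >> 8);     /* arithmetic shift sign-extends */
    }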
 
 SIMD_INLINE int64_t v64_dotp_s16(v64 a, v64 b) {
@@ -371,6 +387,11 @@
                       _mm_and_si128(_mm_xor_si128(a, b), v64_dup_8(1)));
 }
 
+SIMD_INLINE v64 v64_rdavg_u16(v64 a, v64 b) {
+  return _mm_sub_epi16(_mm_avg_epu16(a, b),
+                       _mm_and_si128(_mm_xor_si128(a, b), v64_dup_16(1)));
+}
+
 SIMD_INLINE v64 v64_avg_u16(v64 a, v64 b) { return _mm_avg_epu16(a, b); }
 
 SIMD_INLINE v64 v64_min_u8(v64 a, v64 b) { return _mm_min_epu8(a, b); }
diff --git a/test/simd_cmp_impl.h b/test/simd_cmp_impl.h
index 6c79cbc..386efba 100644
--- a/test/simd_cmp_impl.h
+++ b/test/simd_cmp_impl.h
@@ -181,6 +181,18 @@
   return v128_shr_n_s32(a, shift);
 }
 template <int shift>
+v128 imm_v128_shl_n_64(v128 a) {
+  return v128_shl_n_64(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_u64(v128 a) {
+  return v128_shr_n_u64(a, shift);
+}
+template <int shift>
+v128 imm_v128_shr_n_s64(v128 a) {
+  return v128_shr_n_s64(a, shift);
+}
+template <int shift>
 v128 imm_v128_align(v128 a, v128 b) {
   return v128_align(a, b, shift);
 }
@@ -230,6 +242,18 @@
   return c_v128_shr_n_s32(a, shift);
 }
 template <int shift>
+c_v128 c_imm_v128_shl_n_64(c_v128 a) {
+  return c_v128_shl_n_64(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_u64(c_v128 a) {
+  return c_v128_shr_n_u64(a, shift);
+}
+template <int shift>
+c_v128 c_imm_v128_shr_n_s64(c_v128 a) {
+  return c_v128_shr_n_s64(a, shift);
+}
+template <int shift>
 c_v128 c_imm_v128_align(c_v128 a, c_v128 b) {
   return c_v128_align(a, b, shift);
 }
@@ -279,6 +303,18 @@
   return v256_shr_n_s32(a, shift);
 }
 template <int shift>
+v256 imm_v256_shl_n_64(v256 a) {
+  return v256_shl_n_64(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_u64(v256 a) {
+  return v256_shr_n_u64(a, shift);
+}
+template <int shift>
+v256 imm_v256_shr_n_s64(v256 a) {
+  return v256_shr_n_s64(a, shift);
+}
+template <int shift>
 v256 imm_v256_align(v256 a, v256 b) {
   return v256_align(a, b, shift);
 }
@@ -328,6 +364,18 @@
   return c_v256_shr_n_s32(a, shift);
 }
 template <int shift>
+c_v256 c_imm_v256_shl_n_64(c_v256 a) {
+  return c_v256_shl_n_64(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_u64(c_v256 a) {
+  return c_v256_shr_n_u64(a, shift);
+}
+template <int shift>
+c_v256 c_imm_v256_shr_n_s64(c_v256 a) {
+  return c_v256_shr_n_s64(a, shift);
+}
+template <int shift>
 c_v256 c_imm_v256_align(c_v256 a, c_v256 b) {
   return c_v256_align(a, b, shift);
 }
@@ -358,6 +406,18 @@
 uint32_t c_v128_ssd_u8(c_v128 a, c_v128 b) {
   return c_v128_ssd_u8_sum(::c_v128_ssd_u8(c_v128_ssd_u8_init(), a, b));
 }
+uint32_t v128_sad_u16(v128 a, v128 b) {
+  return v128_sad_u16_sum(::v128_sad_u16(v128_sad_u16_init(), a, b));
+}
+uint64_t v128_ssd_s16(v128 a, v128 b) {
+  return v128_ssd_s16_sum(::v128_ssd_s16(v128_ssd_s16_init(), a, b));
+}
+uint32_t c_v128_sad_u16(c_v128 a, c_v128 b) {
+  return c_v128_sad_u16_sum(::c_v128_sad_u16(c_v128_sad_u16_init(), a, b));
+}
+uint64_t c_v128_ssd_s16(c_v128 a, c_v128 b) {
+  return c_v128_ssd_s16_sum(::c_v128_ssd_s16(c_v128_ssd_s16_init(), a, b));
+}
 uint32_t v256_sad_u8(v256 a, v256 b) {
   return v256_sad_u8_sum(::v256_sad_u8(v256_sad_u8_init(), a, b));
 }
@@ -370,6 +430,18 @@
 uint32_t c_v256_ssd_u8(c_v256 a, c_v256 b) {
   return c_v256_ssd_u8_sum(::c_v256_ssd_u8(c_v256_ssd_u8_init(), a, b));
 }
+uint32_t v256_sad_u16(v256 a, v256 b) {
+  return v256_sad_u16_sum(::v256_sad_u16(v256_sad_u16_init(), a, b));
+}
+uint64_t v256_ssd_s16(v256 a, v256 b) {
+  return v256_ssd_s16_sum(::v256_ssd_s16(v256_ssd_s16_init(), a, b));
+}
+uint32_t c_v256_sad_u16(c_v256 a, c_v256 b) {
+  return c_v256_sad_u16_sum(::c_v256_sad_u16(c_v256_sad_u16_init(), a, b));
+}
+uint64_t c_v256_ssd_s16(c_v256 a, c_v256 b) {
+  return c_v256_ssd_s16_sum(::c_v256_ssd_s16(c_v256_ssd_s16_init(), a, b));
+}
 
 namespace {
 
@@ -391,6 +463,8 @@
                       MAP(v64_ssd_u8),
                       MAP(v64_add_8),
                       MAP(v64_add_16),
+                      MAP(v64_sadd_s8),
+                      MAP(v64_sadd_u8),
                       MAP(v64_sadd_s16),
                       MAP(v64_add_32),
                       MAP(v64_sub_8),
@@ -406,6 +480,7 @@
                       MAP(v64_ziphi_16),
                       MAP(v64_ziplo_32),
                       MAP(v64_ziphi_32),
+                      MAP(v64_pack_s32_u16),
                       MAP(v64_pack_s32_s16),
                       MAP(v64_pack_s16_u8),
                       MAP(v64_pack_s16_s8),
@@ -424,6 +499,7 @@
                       MAP(v64_madd_us8),
                       MAP(v64_avg_u8),
                       MAP(v64_rdavg_u8),
+                      MAP(v64_rdavg_u16),
                       MAP(v64_avg_u16),
                       MAP(v64_min_u8),
                       MAP(v64_max_u8),
@@ -564,10 +640,15 @@
                       MAP(v64_from_16),
                       MAP(v128_sad_u8),
                       MAP(v128_ssd_u8),
+                      MAP(v128_sad_u16),
+                      MAP(v128_ssd_s16),
                       MAP(v128_add_8),
                       MAP(v128_add_16),
+                      MAP(v128_sadd_s8),
+                      MAP(v128_sadd_u8),
                       MAP(v128_sadd_s16),
                       MAP(v128_add_32),
+                      MAP(v128_add_64),
                       MAP(v128_sub_8),
                       MAP(v128_ssub_u8),
                       MAP(v128_ssub_s8),
@@ -575,6 +656,7 @@
                       MAP(v128_ssub_s16),
                       MAP(v128_ssub_u16),
                       MAP(v128_sub_32),
+                      MAP(v128_sub_64),
                       MAP(v128_ziplo_8),
                       MAP(v128_ziphi_8),
                       MAP(v128_ziplo_16),
@@ -589,6 +671,7 @@
                       MAP(v128_unziplo_16),
                       MAP(v128_unziphi_32),
                       MAP(v128_unziplo_32),
+                      MAP(v128_pack_s32_u16),
                       MAP(v128_pack_s32_s16),
                       MAP(v128_pack_s16_u8),
                       MAP(v128_pack_s16_s8),
@@ -603,6 +686,7 @@
                       MAP(v128_madd_us8),
                       MAP(v128_avg_u8),
                       MAP(v128_rdavg_u8),
+                      MAP(v128_rdavg_u16),
                       MAP(v128_avg_u16),
                       MAP(v128_min_u8),
                       MAP(v128_max_u8),
@@ -610,12 +694,17 @@
                       MAP(v128_max_s8),
                       MAP(v128_min_s16),
                       MAP(v128_max_s16),
+                      MAP(v128_min_s32),
+                      MAP(v128_max_s32),
                       MAP(v128_cmpgt_s8),
                       MAP(v128_cmplt_s8),
                       MAP(v128_cmpeq_8),
                       MAP(v128_cmpgt_s16),
                       MAP(v128_cmpeq_16),
                       MAP(v128_cmplt_s16),
+                      MAP(v128_cmpgt_s32),
+                      MAP(v128_cmpeq_32),
+                      MAP(v128_cmplt_s32),
                       MAP(v128_shuffle_8),
                       MAP(imm_v128_align<1>),
                       MAP(imm_v128_align<2>),
@@ -634,6 +723,7 @@
                       MAP(imm_v128_align<15>),
                       MAP(v128_abs_s8),
                       MAP(v128_abs_s16),
+                      MAP(v128_padd_u8),
                       MAP(v128_padd_s16),
                       MAP(v128_unpacklo_u16_s32),
                       MAP(v128_unpacklo_s16_s32),
@@ -738,6 +828,54 @@
                       MAP(imm_v128_shr_n_s32<20>),
                       MAP(imm_v128_shr_n_s32<24>),
                       MAP(imm_v128_shr_n_s32<28>),
+                      MAP(imm_v128_shl_n_64<1>),
+                      MAP(imm_v128_shl_n_64<4>),
+                      MAP(imm_v128_shl_n_64<8>),
+                      MAP(imm_v128_shl_n_64<12>),
+                      MAP(imm_v128_shl_n_64<16>),
+                      MAP(imm_v128_shl_n_64<20>),
+                      MAP(imm_v128_shl_n_64<24>),
+                      MAP(imm_v128_shl_n_64<28>),
+                      MAP(imm_v128_shl_n_64<32>),
+                      MAP(imm_v128_shl_n_64<36>),
+                      MAP(imm_v128_shl_n_64<40>),
+                      MAP(imm_v128_shl_n_64<44>),
+                      MAP(imm_v128_shl_n_64<48>),
+                      MAP(imm_v128_shl_n_64<52>),
+                      MAP(imm_v128_shl_n_64<56>),
+                      MAP(imm_v128_shl_n_64<60>),
+                      MAP(imm_v128_shr_n_u64<1>),
+                      MAP(imm_v128_shr_n_u64<4>),
+                      MAP(imm_v128_shr_n_u64<8>),
+                      MAP(imm_v128_shr_n_u64<12>),
+                      MAP(imm_v128_shr_n_u64<16>),
+                      MAP(imm_v128_shr_n_u64<20>),
+                      MAP(imm_v128_shr_n_u64<24>),
+                      MAP(imm_v128_shr_n_u64<28>),
+                      MAP(imm_v128_shr_n_u64<32>),
+                      MAP(imm_v128_shr_n_u64<36>),
+                      MAP(imm_v128_shr_n_u64<40>),
+                      MAP(imm_v128_shr_n_u64<44>),
+                      MAP(imm_v128_shr_n_u64<48>),
+                      MAP(imm_v128_shr_n_u64<52>),
+                      MAP(imm_v128_shr_n_u64<56>),
+                      MAP(imm_v128_shr_n_u64<60>),
+                      MAP(imm_v128_shr_n_s64<1>),
+                      MAP(imm_v128_shr_n_s64<4>),
+                      MAP(imm_v128_shr_n_s64<8>),
+                      MAP(imm_v128_shr_n_s64<12>),
+                      MAP(imm_v128_shr_n_s64<16>),
+                      MAP(imm_v128_shr_n_s64<20>),
+                      MAP(imm_v128_shr_n_s64<24>),
+                      MAP(imm_v128_shr_n_s64<28>),
+                      MAP(imm_v128_shr_n_s64<32>),
+                      MAP(imm_v128_shr_n_s64<36>),
+                      MAP(imm_v128_shr_n_s64<40>),
+                      MAP(imm_v128_shr_n_s64<44>),
+                      MAP(imm_v128_shr_n_s64<48>),
+                      MAP(imm_v128_shr_n_s64<52>),
+                      MAP(imm_v128_shr_n_s64<56>),
+                      MAP(imm_v128_shr_n_s64<60>),
                       MAP(v128_from_v64),
                       MAP(v128_zip_8),
                       MAP(v128_zip_16),
@@ -756,21 +894,29 @@
                       MAP(v128_shl_32),
                       MAP(v128_shr_u32),
                       MAP(v128_shr_s32),
+                      MAP(v128_shl_64),
+                      MAP(v128_shr_u64),
+                      MAP(v128_shr_s64),
                       MAP(v128_hadd_u8),
+                      MAP(v128_dotp_su8),
                       MAP(v128_dotp_s16),
+                      MAP(v128_dotp_s32),
                       MAP(v128_low_u32),
                       MAP(v128_low_v64),
                       MAP(v128_high_v64),
                       MAP(v128_from_64),
                       MAP(v128_from_32),
+                      MAP(v128_movemask_8),
                       MAP(v128_zero),
                       MAP(v128_dup_8),
                       MAP(v128_dup_16),
                       MAP(v128_dup_32),
+                      MAP(v128_dup_64),
                       MAP(v128_unpacklo_u8_s16),
                       MAP(v128_unpackhi_u8_s16),
                       MAP(v128_unpacklo_s8_s16),
                       MAP(v128_unpackhi_s8_s16),
+                      MAP(v128_blend_8),
                       MAP(u32_load_unaligned),
                       MAP(u32_store_unaligned),
                       MAP(v64_load_unaligned),
@@ -779,12 +925,20 @@
                       MAP(v128_store_unaligned),
                       MAP(v256_sad_u8),
                       MAP(v256_ssd_u8),
+                      MAP(v256_sad_u16),
+                      MAP(v256_ssd_s16),
                       MAP(v256_hadd_u8),
+                      MAP(v256_low_u64),
+                      MAP(v256_dotp_su8),
                       MAP(v256_dotp_s16),
+                      MAP(v256_dotp_s32),
                       MAP(v256_add_8),
                       MAP(v256_add_16),
+                      MAP(v256_sadd_s8),
+                      MAP(v256_sadd_u8),
                       MAP(v256_sadd_s16),
                       MAP(v256_add_32),
+                      MAP(v256_add_64),
                       MAP(v256_sub_8),
                       MAP(v256_ssub_u8),
                       MAP(v256_ssub_s8),
@@ -792,6 +946,7 @@
                       MAP(v256_ssub_u16),
                       MAP(v256_ssub_s16),
                       MAP(v256_sub_32),
+                      MAP(v256_sub_64),
                       MAP(v256_ziplo_8),
                       MAP(v256_ziphi_8),
                       MAP(v256_ziplo_16),
@@ -806,6 +961,9 @@
                       MAP(v256_unziplo_16),
                       MAP(v256_unziphi_32),
                       MAP(v256_unziplo_32),
+                      MAP(v256_unziphi_64),
+                      MAP(v256_unziplo_64),
+                      MAP(v256_pack_s32_u16),
                       MAP(v256_pack_s32_s16),
                       MAP(v256_pack_s16_u8),
                       MAP(v256_pack_s16_s8),
@@ -820,6 +978,7 @@
                       MAP(v256_madd_us8),
                       MAP(v256_avg_u8),
                       MAP(v256_rdavg_u8),
+                      MAP(v256_rdavg_u16),
                       MAP(v256_avg_u16),
                       MAP(v256_min_u8),
                       MAP(v256_max_u8),
@@ -827,14 +986,20 @@
                       MAP(v256_max_s8),
                       MAP(v256_min_s16),
                       MAP(v256_max_s16),
+                      MAP(v256_min_s32),
+                      MAP(v256_max_s32),
                       MAP(v256_cmpgt_s8),
                       MAP(v256_cmplt_s8),
                       MAP(v256_cmpeq_8),
                       MAP(v256_cmpgt_s16),
                       MAP(v256_cmplt_s16),
                       MAP(v256_cmpeq_16),
+                      MAP(v256_cmpgt_s32),
+                      MAP(v256_cmplt_s32),
+                      MAP(v256_cmpeq_32),
                       MAP(v256_shuffle_8),
                       MAP(v256_pshuffle_8),
+                      MAP(v256_wideshuffle_8),
                       MAP(imm_v256_align<1>),
                       MAP(imm_v256_align<2>),
                       MAP(imm_v256_align<3>),
@@ -884,8 +1049,12 @@
                       MAP(v256_shl_32),
                       MAP(v256_shr_u32),
                       MAP(v256_shr_s32),
+                      MAP(v256_shl_64),
+                      MAP(v256_shr_u64),
+                      MAP(v256_shr_s64),
                       MAP(v256_abs_s8),
                       MAP(v256_abs_s16),
+                      MAP(v256_padd_u8),
                       MAP(v256_padd_s16),
                       MAP(v256_unpacklo_u16_s32),
                       MAP(v256_unpacklo_s16_s32),
@@ -1022,10 +1191,60 @@
                       MAP(imm_v256_shr_n_s32<20>),
                       MAP(imm_v256_shr_n_s32<24>),
                       MAP(imm_v256_shr_n_s32<28>),
+                      MAP(imm_v256_shl_n_64<1>),
+                      MAP(imm_v256_shl_n_64<4>),
+                      MAP(imm_v256_shl_n_64<8>),
+                      MAP(imm_v256_shl_n_64<12>),
+                      MAP(imm_v256_shl_n_64<16>),
+                      MAP(imm_v256_shl_n_64<20>),
+                      MAP(imm_v256_shl_n_64<24>),
+                      MAP(imm_v256_shl_n_64<28>),
+                      MAP(imm_v256_shl_n_64<32>),
+                      MAP(imm_v256_shl_n_64<36>),
+                      MAP(imm_v256_shl_n_64<40>),
+                      MAP(imm_v256_shl_n_64<44>),
+                      MAP(imm_v256_shl_n_64<48>),
+                      MAP(imm_v256_shl_n_64<52>),
+                      MAP(imm_v256_shl_n_64<56>),
+                      MAP(imm_v256_shl_n_64<60>),
+                      MAP(imm_v256_shr_n_u64<1>),
+                      MAP(imm_v256_shr_n_u64<4>),
+                      MAP(imm_v256_shr_n_u64<8>),
+                      MAP(imm_v256_shr_n_u64<12>),
+                      MAP(imm_v256_shr_n_u64<16>),
+                      MAP(imm_v256_shr_n_u64<20>),
+                      MAP(imm_v256_shr_n_u64<24>),
+                      MAP(imm_v256_shr_n_u64<28>),
+                      MAP(imm_v256_shr_n_u64<32>),
+                      MAP(imm_v256_shr_n_u64<36>),
+                      MAP(imm_v256_shr_n_u64<40>),
+                      MAP(imm_v256_shr_n_u64<44>),
+                      MAP(imm_v256_shr_n_u64<48>),
+                      MAP(imm_v256_shr_n_u64<52>),
+                      MAP(imm_v256_shr_n_u64<56>),
+                      MAP(imm_v256_shr_n_u64<60>),
+                      MAP(imm_v256_shr_n_s64<1>),
+                      MAP(imm_v256_shr_n_s64<4>),
+                      MAP(imm_v256_shr_n_s64<8>),
+                      MAP(imm_v256_shr_n_s64<12>),
+                      MAP(imm_v256_shr_n_s64<16>),
+                      MAP(imm_v256_shr_n_s64<20>),
+                      MAP(imm_v256_shr_n_s64<24>),
+                      MAP(imm_v256_shr_n_s64<28>),
+                      MAP(imm_v256_shr_n_s64<32>),
+                      MAP(imm_v256_shr_n_s64<36>),
+                      MAP(imm_v256_shr_n_s64<40>),
+                      MAP(imm_v256_shr_n_s64<44>),
+                      MAP(imm_v256_shr_n_s64<48>),
+                      MAP(imm_v256_shr_n_s64<52>),
+                      MAP(imm_v256_shr_n_s64<56>),
+                      MAP(imm_v256_shr_n_s64<60>),
+                      MAP(v256_movemask_8),
                       MAP(v256_zero),
                       MAP(v256_dup_8),
                       MAP(v256_dup_16),
                       MAP(v256_dup_32),
+                      MAP(v256_dup_64),
                       MAP(v256_low_u32),
                       MAP(v256_low_v64),
                       MAP(v256_from_64),
@@ -1036,6 +1255,7 @@
                       MAP(v256_unpackhi_u8_s16),
                       MAP(v256_unpacklo_s8_s16),
                       MAP(v256_unpackhi_s8_s16),
+                      MAP(v256_blend_8),
                       { NULL, NULL, NULL } };
 #undef MAP
 
@@ -1052,7 +1272,7 @@
   *simd = m[i].simd;
 }
 
-// Used for printing errors in TestSimd1Arg and TestSimd2Args
+// Used for printing errors in TestSimd1Arg, TestSimd2Args and TestSimd3Args
 std::string Print(const uint8_t *a, int size) {
   std::string text = "0x";
   for (int i = 0; i < size; i++) {
@@ -1065,7 +1285,8 @@
   return text;
 }
 
-// Used in TestSimd1Arg and TestSimd2Args to restrict argument ranges
+// Used in TestSimd1Arg, TestSimd2Args and TestSimd3Args to restrict argument
+// ranges
 void SetMask(uint8_t *s, int size, uint32_t mask, uint32_t maskwidth) {
   switch (maskwidth) {
     case 0: {
@@ -1143,16 +1364,16 @@
   return *(reinterpret_cast<const uint8_t *>(p));
 }
 
-// CompareSimd1Arg and CompareSimd2Args compare intrinsics taking 1 or
-// 2 arguments respectively with their corresponding C reference.
-// Ideally, the loads and stores should have gone into the template
-// parameter list, but v64 and v128 could be typedef'ed to the same
-// type (which is the case on x86) and then we can't instantiate both
-// v64 and v128, so the function return and argument types, including
-// the always differing types in the C equivalent are used instead.
-// The function arguments must be void pointers and then go through a
-// cast to avoid matching errors in the branches eliminated by the
-// typeid tests in the calling function.
+// CompareSimd1Arg, CompareSimd2Args and CompareSimd3Args compare
+// intrinsics taking 1, 2 or 3 arguments respectively with their
+// corresponding C reference.  Ideally, the loads and stores should
+// have gone into the template parameter list, but v64 and v128 could
+// be typedef'ed to the same type (which is the case on x86) and then
+// we can't instantiate both v64 and v128, so the function return and
+// argument types, including the always differing types in the C
+// equivalent, are used instead.  The function arguments must be void
+// pointers and then go through a cast to avoid matching errors in the
+// branches eliminated by the typeid tests in the calling function.
 template <typename Ret, typename Arg, typename CRet, typename CArg>
 int CompareSimd1Arg(fptr store, fptr load, fptr simd, void *d, fptr c_store,
                     fptr c_load, fptr c_simd, void *ref_d, const void *a) {
@@ -1195,6 +1416,35 @@
   return memcmp(ref_d, d, sizeof(CRet));
 }
 
+template <typename Ret, typename Arg1, typename Arg2, typename Arg3,
+          typename CRet, typename CArg1, typename CArg2, typename CArg3>
+int CompareSimd3Args(fptr store, fptr load1, fptr load2, fptr load3, fptr simd,
+                     void *d, fptr c_store, fptr c_load1, fptr c_load2,
+                     fptr c_load3, fptr c_simd, void *ref_d, const void *a,
+                     const void *b, const void *c) {
+  void (*const my_store)(void *, Ret) = (void (*const)(void *, Ret))store;
+  Arg1 (*const my_load1)(const void *) = (Arg1(*const)(const void *))load1;
+  Arg2 (*const my_load2)(const void *) = (Arg2(*const)(const void *))load2;
+  Arg3 (*const my_load3)(const void *) = (Arg3(*const)(const void *))load3;
+  Ret (*const my_simd)(Arg1, Arg2, Arg3) = (Ret(*const)(Arg1, Arg2, Arg3))simd;
+  void (*const my_c_store)(void *, CRet) = (void (*const)(void *, CRet))c_store;
+  CArg1 (*const my_c_load1)(const void *) =
+      (CArg1(*const)(const void *))c_load1;
+  CArg2 (*const my_c_load2)(const void *) =
+      (CArg2(*const)(const void *))c_load2;
+  CArg3 (*const my_c_load3)(const void *) =
+      (CArg3(*const)(const void *))c_load3;
+  CRet (*const my_c_simd)(CArg1, CArg2, CArg3) =
+      (CRet(*const)(CArg1, CArg2, CArg3))c_simd;
+
+  // Call reference and intrinsic
+  my_c_store(ref_d, my_c_simd(my_c_load1(a), my_c_load2(b), my_c_load3(c)));
+  my_store(d, my_simd(my_load1(a), my_load2(b), my_load3(c)));
+
+  // Compare results
+  return memcmp(ref_d, d, sizeof(CRet));
+}
+
 }  // namespace
 
 template <typename CRet, typename CArg>
@@ -1357,6 +1607,14 @@
           reinterpret_cast<fptr>(u32_load_aligned), simd, d,
           reinterpret_cast<fptr>(c_v128_store_aligned),
           reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v128) &&
+               typeid(CArg) == typeid(uint64_t)) {
+      // V128_U64
+      error = CompareSimd1Arg<v128, uint64_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v128_store_aligned),
+          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v128_store_aligned),
+          reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
     } else if (typeid(CRet) == typeid(c_v256) &&
                typeid(CArg) == typeid(c_v256)) {
       // V256_V256
@@ -1397,6 +1655,14 @@
           reinterpret_cast<fptr>(u32_load_aligned), simd, d,
           reinterpret_cast<fptr>(c_v256_store_aligned),
           reinterpret_cast<fptr>(c_u32_load_aligned), ref_simd, ref_d, s);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg) == typeid(uint64_t)) {
+      // V256_U64
+      error = CompareSimd1Arg<v256, uint64_t, CRet, CArg>(
+          reinterpret_cast<fptr>(v256_store_aligned),
+          reinterpret_cast<fptr>(u64_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_v256_store_aligned),
+          reinterpret_cast<fptr>(c_u64_load_aligned), ref_simd, ref_d, s);
     } else if (typeid(CRet) == typeid(uint32_t) &&
                typeid(CArg) == typeid(c_v256)) {
       // U32_V256
@@ -1535,6 +1801,18 @@
           reinterpret_cast<fptr>(c_v128_load_aligned),
           reinterpret_cast<fptr>(c_v128_load_aligned),
           reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(uint64_t) &&
+               typeid(CArg1) == typeid(c_v128) &&
+               typeid(CArg2) == typeid(c_v128)) {
+      // U64_V128V128
+      error = CompareSimd2Args<uint64_t, v128, v128, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(u64_store_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned),
+          reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u64_store_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(c_v128_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
     } else if (typeid(CRet) == typeid(int64_t) &&
                typeid(CArg1) == typeid(c_v128) &&
                typeid(CArg2) == typeid(c_v128)) {
@@ -1595,6 +1873,18 @@
           reinterpret_cast<fptr>(c_v256_load_aligned),
           reinterpret_cast<fptr>(c_v256_load_aligned),
           reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
+    } else if (typeid(CRet) == typeid(uint64_t) &&
+               typeid(CArg1) == typeid(c_v256) &&
+               typeid(CArg2) == typeid(c_v256)) {
+      // U64_V256V256
+      error = CompareSimd2Args<uint64_t, v256, v256, CRet, CArg1, CArg2>(
+          reinterpret_cast<fptr>(u64_store_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned),
+          reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+          reinterpret_cast<fptr>(c_u64_store_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(c_v256_load_aligned),
+          reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2);
     } else if (typeid(CRet) == typeid(int64_t) &&
                typeid(CArg1) == typeid(c_v256) &&
                typeid(CArg2) == typeid(c_v256)) {
@@ -1657,6 +1947,81 @@
                       << Print(ref_d, sizeof(ref_d)) << " (ref)";
 }
 
+template <typename CRet, typename CArg1, typename CArg2, typename CArg3>
+void TestSimd3Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+                   const char *name) {
+  ACMRandom rnd(ACMRandom::DeterministicSeed());
+  fptr ref_simd;
+  fptr simd;
+  int error = 0;
+  DECLARE_ALIGNED(32, uint8_t, s1[sizeof(CArg1)]);
+  DECLARE_ALIGNED(32, uint8_t, s2[sizeof(CArg2)]);
+  DECLARE_ALIGNED(32, uint8_t, s3[sizeof(CArg3)]);
+  DECLARE_ALIGNED(32, uint8_t, d[sizeof(CRet)]);
+  DECLARE_ALIGNED(32, uint8_t, ref_d[sizeof(CRet)]);
+  memset(ref_d, 0, sizeof(ref_d));
+  memset(d, 0, sizeof(d));
+
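+  // Look up the C reference and SIMD implementations of the named intrinsic.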
+  Map(name, &ref_simd, &simd);
+  if (simd == NULL || ref_simd == NULL) {
+    FAIL() << "Internal error: Unknown intrinsic function " << name;
+  }
+
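+  // Run random inputs through both versions until a mismatch or failure.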
+  for (unsigned int count = 0;
+       count < iterations && !error && !testing::Test::HasFailure(); count++) {
+    for (unsigned int c = 0; c < sizeof(CArg1); c++) s1[c] = rnd.Rand8();
+
+    for (unsigned int c = 0; c < sizeof(CArg2); c++) s2[c] = rnd.Rand8();
+
+    for (unsigned int c = 0; c < sizeof(CArg3); c++) s3[c] = rnd.Rand8();
+
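+    // If a mask is given, constrain the third argument (e.g. shuffle indices).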
+    if (maskwidth) SetMask(s3, sizeof(CArg3), mask, maskwidth);
+
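+    // Dispatch on the signature of the C reference to select matching
+    // load/store helpers for the comparison.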
+    if (typeid(CRet) == typeid(c_v128) && typeid(CArg1) == typeid(c_v128) &&
+        typeid(CArg2) == typeid(c_v128) && typeid(CArg3) == typeid(c_v128)) {
+      // V128_V128V128V128
+      error =
+          CompareSimd3Args<v128, v128, v128, v128, CRet, CArg1, CArg2, CArg3>(
+              reinterpret_cast<fptr>(v128_store_aligned),
+              reinterpret_cast<fptr>(v128_load_aligned),
+              reinterpret_cast<fptr>(v128_load_aligned),
+              reinterpret_cast<fptr>(v128_load_aligned), simd, d,
+              reinterpret_cast<fptr>(c_v128_store_aligned),
+              reinterpret_cast<fptr>(c_v128_load_aligned),
+              reinterpret_cast<fptr>(c_v128_load_aligned),
+              reinterpret_cast<fptr>(c_v128_load_aligned),
+              reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
+    } else if (typeid(CRet) == typeid(c_v256) &&
+               typeid(CArg1) == typeid(c_v256) &&
+               typeid(CArg2) == typeid(c_v256) &&
+               typeid(CArg3) == typeid(c_v256)) {
+      // V256_V256V256V256
+      error =
+          CompareSimd3Args<v256, v256, v256, v256, CRet, CArg1, CArg2, CArg3>(
+              reinterpret_cast<fptr>(v256_store_aligned),
+              reinterpret_cast<fptr>(v256_load_aligned),
+              reinterpret_cast<fptr>(v256_load_aligned),
+              reinterpret_cast<fptr>(v256_load_aligned), simd, d,
+              reinterpret_cast<fptr>(c_v256_store_aligned),
+              reinterpret_cast<fptr>(c_v256_load_aligned),
+              reinterpret_cast<fptr>(c_v256_load_aligned),
+              reinterpret_cast<fptr>(c_v256_load_aligned),
+              reinterpret_cast<fptr>(ref_simd), ref_d, s1, s2, s3);
+    } else {
+      FAIL() << "Internal error: Unknown intrinsic function "
+             << typeid(CRet).name() << " " << name << "("
+             << typeid(CArg1).name() << ", " << typeid(CArg2).name() << ", "
+             << typeid(CArg3).name() << ")";
+    }
+  }
+
+  EXPECT_EQ(0, error) << "Error: mismatch for " << name << "("
+                      << Print(s1, sizeof(s1)) << ", " << Print(s2, sizeof(s2))
+                      << ", " << Print(s3, sizeof(s3)) << ") -> "
+                      << Print(d, sizeof(d)) << " (simd), "
+                      << Print(ref_d, sizeof(ref_d)) << " (ref)";
+}
+
 // Instantiations to make the functions callable from other files
 template void TestSimd1Arg<c_v64, uint8_t>(uint32_t, uint32_t, uint32_t,
                                            const char *);
@@ -1692,6 +2057,8 @@
                                              const char *);
 template void TestSimd1Arg<c_v128, uint32_t>(uint32_t, uint32_t, uint32_t,
                                              const char *);
+template void TestSimd1Arg<c_v128, uint64_t>(uint32_t, uint32_t, uint32_t,
+                                             const char *);
 template void TestSimd1Arg<c_v128, c_v64>(uint32_t, uint32_t, uint32_t,
                                           const char *);
 template void TestSimd1Arg<uint32_t, c_v128>(uint32_t, uint32_t, uint32_t,
@@ -1708,10 +2075,15 @@
                                                         uint32_t, const char *);
 template void TestSimd2Args<c_v128, c_v64, c_v64>(uint32_t, uint32_t, uint32_t,
                                                   const char *);
+template void TestSimd2Args<uint64_t, c_v128, c_v128>(uint32_t, uint32_t,
+                                                      uint32_t, const char *);
 template void TestSimd2Args<int64_t, c_v128, c_v128>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
 template void TestSimd2Args<uint32_t, c_v128, c_v128>(uint32_t, uint32_t,
                                                       uint32_t, const char *);
+template void TestSimd3Args<c_v128, c_v128, c_v128, c_v128>(uint32_t, uint32_t,
+                                                            uint32_t,
+                                                            const char *);
 template void TestSimd1Arg<c_v256, c_v128>(uint32_t, uint32_t, uint32_t,
                                            const char *);
 template void TestSimd1Arg<c_v256, c_v256>(uint32_t, uint32_t, uint32_t,
@@ -1724,6 +2096,8 @@
                                              const char *);
 template void TestSimd1Arg<c_v256, uint32_t>(uint32_t, uint32_t, uint32_t,
                                              const char *);
+template void TestSimd1Arg<c_v256, uint64_t>(uint32_t, uint32_t, uint32_t,
+                                             const char *);
 template void TestSimd1Arg<uint32_t, c_v256>(uint32_t, uint32_t, uint32_t,
                                              const char *);
 template void TestSimd1Arg<c_v64, c_v256>(uint32_t, uint32_t, uint32_t,
@@ -1734,9 +2108,14 @@
                                                     uint32_t, const char *);
 template void TestSimd2Args<c_v256, c_v256, uint32_t>(uint32_t, uint32_t,
                                                       uint32_t, const char *);
+template void TestSimd2Args<uint64_t, c_v256, c_v256>(uint32_t, uint32_t,
+                                                      uint32_t, const char *);
 template void TestSimd2Args<int64_t, c_v256, c_v256>(uint32_t, uint32_t,
                                                      uint32_t, const char *);
 template void TestSimd2Args<uint32_t, c_v256, c_v256>(uint32_t, uint32_t,
                                                       uint32_t, const char *);
+template void TestSimd3Args<c_v256, c_v256, c_v256, c_v256>(uint32_t, uint32_t,
+                                                            uint32_t,
+                                                            const char *);
 
 }  // namespace SIMD_NAMESPACE
diff --git a/test/simd_impl.h b/test/simd_impl.h
index ed04b88..4190d73 100644
--- a/test/simd_impl.h
+++ b/test/simd_impl.h
@@ -61,23 +61,29 @@
 TYPEDEF_SIMD(V128_U8);
 TYPEDEF_SIMD(V128_U16);
 TYPEDEF_SIMD(V128_U32);
+TYPEDEF_SIMD(V128_U64);
 TYPEDEF_SIMD(V128_U64U64);
 TYPEDEF_SIMD(V128_V64V64);
 TYPEDEF_SIMD(V128_V128V128);
+TYPEDEF_SIMD(V128_V128V128V128);
 TYPEDEF_SIMD(S64_V128V128);
 TYPEDEF_SIMD(V128_V128U32);
 TYPEDEF_SIMD(U32_V128V128);
+TYPEDEF_SIMD(U64_V128V128);
 TYPEDEF_SIMD(V256_V128);
 TYPEDEF_SIMD(V256_V256);
 TYPEDEF_SIMD(U64_V256);
 TYPEDEF_SIMD(V256_V128V128);
 TYPEDEF_SIMD(V256_V256V256);
+TYPEDEF_SIMD(V256_V256V256V256);
+TYPEDEF_SIMD(U64_V256V256);
 TYPEDEF_SIMD(S64_V256V256);
 TYPEDEF_SIMD(V256_V256U32);
 TYPEDEF_SIMD(U32_V256V256);
 TYPEDEF_SIMD(V256_U8);
 TYPEDEF_SIMD(V256_U16);
 TYPEDEF_SIMD(V256_U32);
+TYPEDEF_SIMD(V256_U64);
 TYPEDEF_SIMD(U32_V256);
 TYPEDEF_SIMD(V64_V256);
 
@@ -86,9 +92,11 @@
 typedef ARCH_POSTFIX(V64_V64V64) ARCH_POSTFIX(V64_V64V64_Part2);
 typedef ARCH_POSTFIX(V128_V128) ARCH_POSTFIX(V128_V128_Part2);
 typedef ARCH_POSTFIX(V128_V128) ARCH_POSTFIX(V128_V128_Part3);
+typedef ARCH_POSTFIX(V128_V128) ARCH_POSTFIX(V128_V128_Part4);
 typedef ARCH_POSTFIX(V128_V128V128) ARCH_POSTFIX(V128_V128V128_Part2);
 typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part2);
 typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part3);
+typedef ARCH_POSTFIX(V256_V256) ARCH_POSTFIX(V256_V256_Part4);
 typedef ARCH_POSTFIX(V256_V256V256) ARCH_POSTFIX(V256_V256V256_Part2);
 
 // These functions are machine tuned and located elsewhere
@@ -100,6 +108,10 @@
 void TestSimd2Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
                    const char *name);
 
+template <typename c_ret, typename c_arg1, typename c_arg2, typename c_arg3>
+void TestSimd3Args(uint32_t iterations, uint32_t mask, uint32_t maskwidth,
+                   const char *name);
+
 const int kIterations = 65536;
 
 // Add a macro layer since TEST_P will quote the name so we need to
@@ -195,6 +207,10 @@
   TestSimd1Arg<c_v128, uint32_t>(kIterations, mask, maskwidth, name);
 }
 
+MY_TEST_P(ARCH_POSTFIX(V128_U64), TestIntrinsics) {
+  TestSimd1Arg<c_v128, uint64_t>(kIterations, mask, maskwidth, name);
+}
+
 MY_TEST_P(ARCH_POSTFIX(V128_V64), TestIntrinsics) {
   TestSimd1Arg<c_v128, c_v64>(kIterations, mask, maskwidth, name);
 }
@@ -203,10 +219,19 @@
   TestSimd2Args<c_v128, c_v128, c_v128>(kIterations, mask, maskwidth, name);
 }
 
+MY_TEST_P(ARCH_POSTFIX(V128_V128V128V128), TestIntrinsics) {
+  TestSimd3Args<c_v128, c_v128, c_v128, c_v128>(kIterations, mask, maskwidth,
+                                                name);
+}
+
 MY_TEST_P(ARCH_POSTFIX(U32_V128V128), TestIntrinsics) {
   TestSimd2Args<uint32_t, c_v128, c_v128>(kIterations, mask, maskwidth, name);
 }
 
+MY_TEST_P(ARCH_POSTFIX(U64_V128V128), TestIntrinsics) {
+  TestSimd2Args<uint64_t, c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
 MY_TEST_P(ARCH_POSTFIX(S64_V128V128), TestIntrinsics) {
   TestSimd2Args<int64_t, c_v128, c_v128>(kIterations, mask, maskwidth, name);
 }
@@ -235,6 +260,10 @@
   TestSimd1Arg<c_v128, c_v128>(kIterations, mask, maskwidth, name);
 }
 
+MY_TEST_P(ARCH_POSTFIX(V128_V128_Part4), TestIntrinsics) {
+  TestSimd1Arg<c_v128, c_v128>(kIterations, mask, maskwidth, name);
+}
+
 MY_TEST_P(ARCH_POSTFIX(U64_V256), TestIntrinsics) {
   TestSimd1Arg<uint64_t, c_v256>(kIterations, mask, maskwidth, name);
 }
@@ -251,6 +280,11 @@
   TestSimd2Args<c_v256, c_v256, c_v256>(kIterations, mask, maskwidth, name);
 }
 
+MY_TEST_P(ARCH_POSTFIX(V256_V256V256V256), TestIntrinsics) {
+  TestSimd3Args<c_v256, c_v256, c_v256, c_v256>(kIterations, mask, maskwidth,
+                                                name);
+}
+
 MY_TEST_P(ARCH_POSTFIX(V256_V128V128), TestIntrinsics) {
   TestSimd2Args<c_v256, c_v128, c_v128>(kIterations, mask, maskwidth, name);
 }
@@ -259,6 +293,10 @@
   TestSimd2Args<uint32_t, c_v256, c_v256>(kIterations, mask, maskwidth, name);
 }
 
+MY_TEST_P(ARCH_POSTFIX(U64_V256V256), TestIntrinsics) {
+  TestSimd2Args<uint64_t, c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
 MY_TEST_P(ARCH_POSTFIX(S64_V256V256), TestIntrinsics) {
   TestSimd2Args<int64_t, c_v256, c_v256>(kIterations, mask, maskwidth, name);
 }
@@ -279,6 +317,10 @@
   TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
 }
 
+MY_TEST_P(ARCH_POSTFIX(V256_V256_Part4), TestIntrinsics) {
+  TestSimd1Arg<c_v256, c_v256>(kIterations, mask, maskwidth, name);
+}
+
 MY_TEST_P(ARCH_POSTFIX(V256_U8), TestIntrinsics) {
   TestSimd1Arg<c_v256, uint8_t>(kIterations, mask, maskwidth, name);
 }
@@ -291,6 +333,10 @@
   TestSimd1Arg<c_v256, uint32_t>(kIterations, mask, maskwidth, name);
 }
 
+MY_TEST_P(ARCH_POSTFIX(V256_U64), TestIntrinsics) {
+  TestSimd1Arg<c_v256, uint64_t>(kIterations, mask, maskwidth, name);
+}
+
 MY_TEST_P(ARCH_POSTFIX(U32_V256), TestIntrinsics) {
   TestSimd1Arg<uint32_t, c_v256>(kIterations, mask, maskwidth, name);
 }
@@ -339,6 +385,8 @@
 
 INSTANTIATE(
     ARCH, ARCH_POSTFIX(V64_V64V64_Part2), SIMD_TUPLE(v64_shuffle_8, 7U, 8U),
+    SIMD_TUPLE(v64_pack_s32_u16, 0U, 0U), SIMD_TUPLE(v64_rdavg_u16, 0U, 0U),
+    SIMD_TUPLE(v64_sadd_s8, 0U, 0U), SIMD_TUPLE(v64_sadd_u8, 0U, 0U),
     SIMD_TUPLE(imm_v64_align<1>, 0U, 0U), SIMD_TUPLE(imm_v64_align<2>, 0U, 0U),
     SIMD_TUPLE(imm_v64_align<3>, 0U, 0U), SIMD_TUPLE(imm_v64_align<4>, 0U, 0U),
     SIMD_TUPLE(imm_v64_align<5>, 0U, 0U), SIMD_TUPLE(imm_v64_align<6>, 0U, 0U),
@@ -470,7 +518,8 @@
 INSTANTIATE(ARCH, ARCH_POSTFIX(V64_U32U32), SIMD_TUPLE(v64_from_32, 0U, 0U));
 
 INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V128V128), SIMD_TUPLE(v128_sad_u8, 0U, 0U),
-            SIMD_TUPLE(v128_ssd_u8, 0U, 0U));
+            SIMD_TUPLE(v128_ssd_u8, 0U, 0U), SIMD_TUPLE(v128_sad_u16, 0U, 0U));
+INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V128V128), SIMD_TUPLE(v128_ssd_s16, 0U, 0U));
 
 INSTANTIATE(
     ARCH, ARCH_POSTFIX(V128_V128V128), SIMD_TUPLE(v128_add_8, 0U, 0U),
@@ -501,9 +550,16 @@
     SIMD_TUPLE(v128_cmpgt_s16, 0U, 0U));
 
 INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128V128_Part2),
-            SIMD_TUPLE(v128_cmpeq_16, 0U, 0U),
+            SIMD_TUPLE(v128_pack_s32_u16, 0U, 0U),
+            SIMD_TUPLE(v128_rdavg_u16, 0U, 0U), SIMD_TUPLE(v128_add_64, 0U, 0U),
+            SIMD_TUPLE(v128_sub_64, 0U, 0U), SIMD_TUPLE(v128_sadd_s8, 0U, 0U),
+            SIMD_TUPLE(v128_sadd_u8, 0U, 0U), SIMD_TUPLE(v128_cmpeq_16, 0U, 0U),
             SIMD_TUPLE(v128_cmplt_s16, 0U, 0U),
+            SIMD_TUPLE(v128_cmplt_s32, 0U, 0U),
+            SIMD_TUPLE(v128_cmpeq_32, 0U, 0U),
+            SIMD_TUPLE(v128_cmpgt_s32, 0U, 0U),
             SIMD_TUPLE(v128_shuffle_8, 15U, 8U),
+            SIMD_TUPLE(v128_min_s32, 0U, 0U), SIMD_TUPLE(v128_max_s32, 0U, 0U),
             SIMD_TUPLE(imm_v128_align<1>, 0U, 0U),
             SIMD_TUPLE(imm_v128_align<2>, 0U, 0U),
             SIMD_TUPLE(imm_v128_align<3>, 0U, 0U),
@@ -520,6 +576,9 @@
             SIMD_TUPLE(imm_v128_align<14>, 0U, 0U),
             SIMD_TUPLE(imm_v128_align<15>, 0U, 0U));
 
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128V128V128),
+            SIMD_TUPLE(v128_blend_8, 0U, 0U));
+
 INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128), SIMD_TUPLE(v128_abs_s8, 0U, 0U),
             SIMD_TUPLE(v128_abs_s16, 0U, 0U), SIMD_TUPLE(v128_padd_s16, 0U, 0U),
             SIMD_TUPLE(v128_unpacklo_u8_s16, 0U, 0U),
@@ -634,6 +693,57 @@
             SIMD_TUPLE(imm_v128_shr_n_s32<24>, 0U, 0U),
             SIMD_TUPLE(imm_v128_shr_n_s32<28>, 0U, 0U));
 
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128_Part4),
+            SIMD_TUPLE(imm_v128_shl_n_64<1>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<4>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<8>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<12>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<16>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<20>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<24>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<28>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<32>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<36>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<40>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<44>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<48>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<52>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<56>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shl_n_64<60>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<1>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<4>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<8>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<12>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<16>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<20>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<24>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<28>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<32>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<36>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<40>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<44>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<48>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<52>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<56>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_u64<60>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<1>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<4>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<8>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<12>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<16>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<20>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<24>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<28>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<32>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<36>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<40>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<44>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<48>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<52>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<56>, 0U, 0U),
+            SIMD_TUPLE(imm_v128_shr_n_s64<60>, 0U, 0U),
+            SIMD_TUPLE(v128_padd_u8, 0U, 0U));
+
 INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V64V64), SIMD_TUPLE(v128_from_v64, 0U, 0U),
             SIMD_TUPLE(v128_zip_8, 0U, 0U), SIMD_TUPLE(v128_zip_16, 0U, 0U),
             SIMD_TUPLE(v128_zip_32, 0U, 0U), SIMD_TUPLE(v128_mul_s16, 0U, 0U));
@@ -646,16 +756,17 @@
             SIMD_TUPLE(v128_unpack_u16_s32, 0U, 0U),
             SIMD_TUPLE(v128_unpack_s16_s32, 0U, 0U));
 
-INSTANTIATE(ARCH, ARCH_POSTFIX(V128_V128U32), SIMD_TUPLE(v128_shl_8, 7U, 32U),
-            SIMD_TUPLE(v128_shr_u8, 7U, 32U), SIMD_TUPLE(v128_shr_s8, 7U, 32U),
-            SIMD_TUPLE(v128_shl_16, 15U, 32U),
-            SIMD_TUPLE(v128_shr_u16, 15U, 32U),
-            SIMD_TUPLE(v128_shr_s16, 15U, 32U),
-            SIMD_TUPLE(v128_shl_32, 31U, 32U),
-            SIMD_TUPLE(v128_shr_u32, 31U, 32U),
-            SIMD_TUPLE(v128_shr_s32, 31U, 32U));
+INSTANTIATE(
+    ARCH, ARCH_POSTFIX(V128_V128U32), SIMD_TUPLE(v128_shl_8, 7U, 32U),
+    SIMD_TUPLE(v128_shr_u8, 7U, 32U), SIMD_TUPLE(v128_shr_s8, 7U, 32U),
+    SIMD_TUPLE(v128_shl_16, 15U, 32U), SIMD_TUPLE(v128_shr_u16, 15U, 32U),
+    SIMD_TUPLE(v128_shr_s16, 15U, 32U), SIMD_TUPLE(v128_shl_32, 31U, 32U),
+    SIMD_TUPLE(v128_shr_u32, 31U, 32U), SIMD_TUPLE(v128_shr_s32, 31U, 32U),
+    SIMD_TUPLE(v128_shl_64, 63U, 32U), SIMD_TUPLE(v128_shr_u64, 63U, 32U),
+    SIMD_TUPLE(v128_shr_s64, 63U, 32U));
 
-INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V128), SIMD_TUPLE(v128_low_u32, 0U, 0U));
+INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V128), SIMD_TUPLE(v128_low_u32, 0U, 0U),
+            SIMD_TUPLE(v128_movemask_8, 0U, 0U));
 
 INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V128), SIMD_TUPLE(v128_hadd_u8, 0U, 0U));
 
@@ -668,16 +779,23 @@
 
 INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U32), SIMD_TUPLE(v128_dup_32, 0U, 0U));
 
-INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V128V128),
-            SIMD_TUPLE(v128_dotp_s16, 0U, 0U));
+INSTANTIATE(ARCH, ARCH_POSTFIX(V128_U64), SIMD_TUPLE(v128_dup_64, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V128V128), SIMD_TUPLE(v128_dotp_s16, 0U, 0U),
+            SIMD_TUPLE(v128_dotp_s32, 0U, 0U),
+            SIMD_TUPLE(v128_dotp_su8, 0U, 0U));
 
 INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V256V256), SIMD_TUPLE(v256_sad_u8, 0U, 0U),
-            SIMD_TUPLE(v256_ssd_u8, 0U, 0U));
+            SIMD_TUPLE(v256_ssd_u8, 0U, 0U), SIMD_TUPLE(v256_sad_u16, 0U, 0U));
 
-INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V256), SIMD_TUPLE(v256_hadd_u8, 0U, 0U));
+INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V256), SIMD_TUPLE(v256_hadd_u8, 0U, 0U),
+            SIMD_TUPLE(v256_low_u64, 0U, 0U));
 
-INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V256V256),
-            SIMD_TUPLE(v256_dotp_s16, 0U, 0U));
+INSTANTIATE(ARCH, ARCH_POSTFIX(S64_V256V256), SIMD_TUPLE(v256_dotp_s16, 0U, 0U),
+            SIMD_TUPLE(v256_dotp_s32, 0U, 0U),
+            SIMD_TUPLE(v256_dotp_su8, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U64_V256V256), SIMD_TUPLE(v256_ssd_s16, 0U, 0U));
 
 INSTANTIATE(
     ARCH, ARCH_POSTFIX(V256_V256V256), SIMD_TUPLE(v256_add_8, 0U, 0U),
@@ -709,10 +827,16 @@
 
 INSTANTIATE(
     ARCH, ARCH_POSTFIX(V256_V256V256_Part2), SIMD_TUPLE(v256_cmpeq_8, 0U, 0U),
+    SIMD_TUPLE(v256_min_s32, 0U, 0U), SIMD_TUPLE(v256_max_s32, 0U, 0U),
+    SIMD_TUPLE(v256_add_64, 0U, 0U), SIMD_TUPLE(v256_sub_64, 0U, 0U),
     SIMD_TUPLE(v256_cmpgt_s16, 0U, 0U), SIMD_TUPLE(v256_cmplt_s16, 0U, 0U),
-    SIMD_TUPLE(v256_cmpeq_16, 0U, 0U), SIMD_TUPLE(v256_shuffle_8, 15U, 8U),
-    SIMD_TUPLE(v256_pshuffle_8, 15U, 8U), SIMD_TUPLE(imm_v256_align<1>, 0U, 0U),
-    SIMD_TUPLE(imm_v256_align<2>, 0U, 0U),
+    SIMD_TUPLE(v256_cmpeq_16, 0U, 0U), SIMD_TUPLE(v256_cmpgt_s32, 0U, 0U),
+    SIMD_TUPLE(v256_cmplt_s32, 0U, 0U), SIMD_TUPLE(v256_cmpeq_32, 0U, 0U),
+    SIMD_TUPLE(v256_shuffle_8, 31U, 8U), SIMD_TUPLE(v256_pshuffle_8, 15U, 8U),
+    SIMD_TUPLE(imm_v256_align<1>, 0U, 0U), SIMD_TUPLE(v256_sadd_s8, 0U, 0U),
+    SIMD_TUPLE(v256_sadd_u8, 0U, 0U), SIMD_TUPLE(v256_pack_s32_u16, 0U, 0U),
+    SIMD_TUPLE(v256_rdavg_u16, 0U, 0U), SIMD_TUPLE(imm_v256_align<2>, 0U, 0U),
+    SIMD_TUPLE(v256_unziphi_64, 0U, 0U), SIMD_TUPLE(v256_unziplo_64, 0U, 0U),
     SIMD_TUPLE(imm_v256_align<3>, 0U, 0U),
     SIMD_TUPLE(imm_v256_align<4>, 0U, 0U),
     SIMD_TUPLE(imm_v256_align<5>, 0U, 0U),
@@ -754,14 +878,14 @@
             SIMD_TUPLE(v256_unpack_u16_s32, 0U, 0U),
             SIMD_TUPLE(v256_unpack_s16_s32, 0U, 0U));
 
-INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256U32), SIMD_TUPLE(v256_shl_8, 7U, 32U),
-            SIMD_TUPLE(v256_shr_u8, 7U, 32U), SIMD_TUPLE(v256_shr_s8, 7U, 32U),
-            SIMD_TUPLE(v256_shl_16, 15U, 32U),
-            SIMD_TUPLE(v256_shr_u16, 15U, 32U),
-            SIMD_TUPLE(v256_shr_s16, 15U, 32U),
-            SIMD_TUPLE(v256_shl_32, 31U, 32U),
-            SIMD_TUPLE(v256_shr_u32, 31U, 32U),
-            SIMD_TUPLE(v256_shr_s32, 31U, 32U));
+INSTANTIATE(
+    ARCH, ARCH_POSTFIX(V256_V256U32), SIMD_TUPLE(v256_shl_8, 7U, 32U),
+    SIMD_TUPLE(v256_shr_u8, 7U, 32U), SIMD_TUPLE(v256_shr_s8, 7U, 32U),
+    SIMD_TUPLE(v256_shl_16, 15U, 32U), SIMD_TUPLE(v256_shr_u16, 15U, 32U),
+    SIMD_TUPLE(v256_shr_s16, 15U, 32U), SIMD_TUPLE(v256_shl_32, 31U, 32U),
+    SIMD_TUPLE(v256_shr_u32, 31U, 32U), SIMD_TUPLE(v256_shr_s32, 31U, 32U),
+    SIMD_TUPLE(v256_shl_64, 63U, 32U), SIMD_TUPLE(v256_shr_u64, 63U, 32U),
+    SIMD_TUPLE(v256_shr_s64, 63U, 32U));
 
 INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256), SIMD_TUPLE(v256_abs_s8, 0U, 0U),
             SIMD_TUPLE(v256_abs_s16, 0U, 0U), SIMD_TUPLE(v256_padd_s16, 0U, 0U),
@@ -909,13 +1033,71 @@
             SIMD_TUPLE(imm_v256_shr_n_s32<24>, 0U, 0U),
             SIMD_TUPLE(imm_v256_shr_n_s32<28>, 0U, 0U));
 
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256_Part4),
+            SIMD_TUPLE(imm_v256_shl_n_64<1>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<4>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<8>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<12>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<16>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<20>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<24>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<28>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<32>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<36>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<40>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<44>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<48>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<52>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<56>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shl_n_64<60>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<1>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<4>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<8>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<12>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<16>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<20>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<24>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<28>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<32>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<36>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<40>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<44>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<48>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<52>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<56>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_u64<60>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<1>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<4>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<8>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<12>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<16>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<20>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<24>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<28>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<32>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<36>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<40>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<44>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<48>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<52>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<56>, 0U, 0U),
+            SIMD_TUPLE(imm_v256_shr_n_s64<60>, 0U, 0U),
+            SIMD_TUPLE(v256_padd_u8, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_V256V256V256),
+            SIMD_TUPLE(v256_blend_8, 0U, 0U),
+            SIMD_TUPLE(v256_wideshuffle_8, 63U, 8U));
+
 INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U8), SIMD_TUPLE(v256_dup_8, 0U, 0U));
 
 INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U16), SIMD_TUPLE(v256_dup_16, 0U, 0U));
 
 INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U32), SIMD_TUPLE(v256_dup_32, 0U, 0U));
 
-INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V256), SIMD_TUPLE(v256_low_u32, 0U, 0U));
+INSTANTIATE(ARCH, ARCH_POSTFIX(V256_U64), SIMD_TUPLE(v256_dup_64, 0U, 0U));
+
+INSTANTIATE(ARCH, ARCH_POSTFIX(U32_V256), SIMD_TUPLE(v256_low_u32, 0U, 0U),
+            SIMD_TUPLE(v256_movemask_8, 0U, 0U));
 
 INSTANTIATE(ARCH, ARCH_POSTFIX(V64_V256), SIMD_TUPLE(v256_low_v64, 0U, 0U));