Add SSSE3 and AVX2 versions of aom_paeth_predictor_16x64

Change-Id: Ie8f0bd4807e6c0d172da31774167f3c8abd3ca58
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 30d5872..c72e063 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -156,6 +156,7 @@
 specialize qw/aom_paeth_predictor_16x8 ssse3 avx2/;
 specialize qw/aom_paeth_predictor_16x16 ssse3 avx2/;
 specialize qw/aom_paeth_predictor_16x32 ssse3 avx2/;
+specialize qw/aom_paeth_predictor_16x64 ssse3 avx2/;
 specialize qw/aom_paeth_predictor_32x16 ssse3 avx2/;
 specialize qw/aom_paeth_predictor_32x32 ssse3 avx2/;
 specialize qw/aom_paeth_predictor_32x64 ssse3 avx2/;
diff --git a/aom_dsp/x86/intrapred_avx2.c b/aom_dsp/x86/intrapred_avx2.c
index 5c852ac..1a0dcc5 100644
--- a/aom_dsp/x86/intrapred_avx2.c
+++ b/aom_dsp/x86/intrapred_avx2.c
@@ -594,6 +594,26 @@
   }
 }
 
+void aom_paeth_predictor_16x64_avx2(uint8_t *dst, ptrdiff_t stride,
+                                    const uint8_t *above, const uint8_t *left) {
+  const __m256i tl16 = _mm256_set1_epi16((uint16_t)above[-1]);  // top-left px
+  const __m256i one = _mm256_set1_epi16(1);
+  const __m256i top = get_top_vector(above);
+  // 16-wide, 64-row Paeth block: 4 passes of 16 rows, 16 left pixels per pass.
+  for (int j = 0; j < 4; ++j) {
+    const __m256i l = get_left_vector(left + j * 16);  // helper not shown here
+    __m256i rep = _mm256_set1_epi16(0x8000);  // pshufb mask: lo=byte 0, hi=zero
+    for (int i = 0; i < 16; ++i) {
+      const __m256i l16 = _mm256_shuffle_epi8(l, rep);  // byte i -> u16 lanes
+      const __m128i row = paeth_16x1_pred(&l16, &top, &tl16);
+
+      _mm_store_si128((__m128i *)dst, row);  // needs 16-byte aligned dst
+      dst += stride;
+      rep = _mm256_add_epi16(rep, one);  // advance mask to next left byte
+    }
+  }
+}
+
 // Return 32 8-bit pixels in one row (__m256i)
 static INLINE __m256i paeth_32x1_pred(const __m256i *left, const __m256i *top0,
                                       const __m256i *top1,
diff --git a/aom_dsp/x86/intrapred_ssse3.c b/aom_dsp/x86/intrapred_ssse3.c
index 47abcb8..361d9a8 100644
--- a/aom_dsp/x86/intrapred_ssse3.c
+++ b/aom_dsp/x86/intrapred_ssse3.c
@@ -234,6 +234,29 @@
   }
 }
 
+void aom_paeth_predictor_16x64_ssse3(uint8_t *dst, ptrdiff_t stride,
+                                     const uint8_t *above,
+                                     const uint8_t *left) {
+  const __m128i t = _mm_load_si128((const __m128i *)above);  // aligned load
+  const __m128i zero = _mm_setzero_si128();
+  const __m128i top0 = _mm_unpacklo_epi8(t, zero);  // above[0..7] as u16
+  const __m128i top1 = _mm_unpackhi_epi8(t, zero);  // above[8..15] as u16
+  const __m128i tl16 = _mm_set1_epi16((uint16_t)above[-1]);  // top-left pixel
+  const __m128i one = _mm_set1_epi16(1);
+  // 16-wide, 64-row Paeth block: 4 passes of 16 rows, 16 left pixels per pass.
+  for (int j = 0; j < 4; ++j) {
+    const __m128i l = _mm_load_si128((const __m128i *)(left + j * 16));
+    __m128i rep = _mm_set1_epi16(0x8000);  // pshufb mask: lo=byte 0, hi=zero
+    for (int i = 0; i < 16; ++i) {
+      const __m128i l16 = _mm_shuffle_epi8(l, rep);  // left[j*16+i] as 8 x u16
+      const __m128i row = paeth_16x1_pred(&l16, &top0, &top1, &tl16);
+      _mm_store_si128((__m128i *)dst, row);  // needs 16-byte aligned dst
+      dst += stride;
+      rep = _mm_add_epi16(rep, one);  // advance mask to next left byte
+    }
+  }
+}
+
 void aom_paeth_predictor_32x16_ssse3(uint8_t *dst, ptrdiff_t stride,
                                      const uint8_t *above,
                                      const uint8_t *left) {
diff --git a/test/test_intra_pred_speed.cc b/test/test_intra_pred_speed.cc
index 2d3f8de..43a22f7 100644
--- a/test/test_intra_pred_speed.cc
+++ b/test/test_intra_pred_speed.cc
@@ -614,7 +614,8 @@
 INTRA_PRED_TEST(SSSE3_3, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x32_ssse3,
                 aom_smooth_predictor_16x32_ssse3, NULL, NULL)
-INTRA_PRED_TEST(SSSE3_3, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL, NULL,
+INTRA_PRED_TEST(SSSE3_4, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_16x64_ssse3,
                 aom_smooth_predictor_16x64_ssse3, NULL, NULL)
 #endif  // HAVE_SSSE3
 
@@ -625,6 +626,8 @@
                 aom_paeth_predictor_16x8_avx2, NULL, NULL, NULL)
 INTRA_PRED_TEST(AVX2_3, TX_16X32, NULL, NULL, NULL, NULL, NULL, NULL,
                 aom_paeth_predictor_16x32_avx2, NULL, NULL, NULL)
+INTRA_PRED_TEST(AVX2_4, TX_16X64, NULL, NULL, NULL, NULL, NULL, NULL,
+                aom_paeth_predictor_16x64_avx2, NULL, NULL, NULL)
 #endif  // HAVE_AVX2
 
 #if HAVE_DSPR2