Add Neon path for aom_mse_16xh_16bit

Add aom_mse_16xh_16bit_neon and the corresponding unit tests.

Change-Id: Icb1ca438a93a04dab9f0073da0e8592efc3532ec
diff --git a/aom_dsp/aom_dsp_rtcd_defs.pl b/aom_dsp/aom_dsp_rtcd_defs.pl
index 5791f56..4027289 100755
--- a/aom_dsp/aom_dsp_rtcd_defs.pl
+++ b/aom_dsp/aom_dsp_rtcd_defs.pl
@@ -1379,7 +1379,7 @@
   specialize qw/aom_mse_wxh_16bit  sse2 avx2 neon/;
 
   add_proto qw/uint64_t/, "aom_mse_16xh_16bit", "uint8_t *dst, int dstride,uint16_t *src, int w, int h";
-  specialize qw/aom_mse_16xh_16bit sse2 avx2/;
+  specialize qw/aom_mse_16xh_16bit sse2 avx2 neon/;
 
   foreach (@encoder_block_sizes) {
     ($w, $h) = @$_;
diff --git a/aom_dsp/arm/variance_neon.c b/aom_dsp/arm/variance_neon.c
index ebcb5e3..357a083 100644
--- a/aom_dsp/arm/variance_neon.c
+++ b/aom_dsp/arm/variance_neon.c
@@ -454,3 +454,25 @@
 
   return horizontal_add_s32x4(vaddq_s32(sse[0], sse[1]));
 }
+
+// Accumulates mse_wxh_16bit() over the 16 / w column blocks of a 16-wide
+// region, each block w wide and h tall, and returns the horizontal sum of
+// the accumulator. dst advances by w pixels per block (row stride dstride);
+// src advances by w * h samples per block.
+// NOTE(review): assumes w is a positive divisor of 16. w > 16 makes num_blks
+// zero, so the do-while counter would wrap; confirm against callers.
+uint64_t aom_mse_16xh_16bit_neon(uint8_t *dst, int dstride, uint16_t *src,
+                                 int w, int h) {
+  uint64x2_t sum = vdupq_n_u64(0);
+
+  int num_blks = 16 / w;
+  do {
+    // Use the intrinsic rather than `+=`: arithmetic operators on Neon vector
+    // types are a GCC/Clang extension and do not compile with MSVC.
+    sum = vaddq_u64(sum, mse_wxh_16bit(dst, dstride, src, w, w, h));
+    dst += w;
+    src += w * h;
+  } while (--num_blks != 0);
+
+  return horizontal_add_u64x2(sum);
+}
diff --git a/test/variance_test.cc b/test/variance_test.cc
index 722bec7..c1963e0 100644
--- a/test/variance_test.cc
+++ b/test/variance_test.cc
@@ -3283,6 +3283,13 @@
                       MseWxHParams(2, 3, &aom_mse_wxh_16bit_neon, 8),
                       MseWxHParams(2, 2, &aom_mse_wxh_16bit_neon, 8)));
 
+INSTANTIATE_TEST_SUITE_P(
+    NEON, Mse16xHTest,
+    ::testing::Values(Mse16xHParams(3, 3, &aom_mse_16xh_16bit_neon, 8),
+                      Mse16xHParams(3, 2, &aom_mse_16xh_16bit_neon, 8),
+                      Mse16xHParams(2, 3, &aom_mse_16xh_16bit_neon, 8),
+                      Mse16xHParams(2, 2, &aom_mse_16xh_16bit_neon, 8)));
+
 INSTANTIATE_TEST_SUITE_P(NEON, SumOfSquaresTest,
                          ::testing::Values(aom_get_mb_ss_neon));