Hamming Distance using 16 bit accumulators

Summing 16 bit Hamming codes restricts the maximum length, but saves an inner loop instruction. The outer loop can sum the values.

32 bit Neon
Now: BenchmarkHammingDistance_Opt (78 ms)
Was: BenchmarkHammingDistance_Opt (92 ms)

64 bit Neon
Now: BenchmarkHammingDistance_Opt (85 ms)
Was: BenchmarkHammingDistance_Opt (92 ms)

R=wangcheng@google.com
TBR=kjellander@chromium.org
BUG=libyuv:701
TEST=BenchmarkHammingDistance

Change-Id: Ie40f0eac2f3339c33b833b42af5d394b122066ae
Reviewed-on: https://chromium-review.googlesource.com/526932
Reviewed-by: Frank Barchard <fbarchard@google.com>
Reviewed-by: Cheng Wang <wangcheng@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>

commit: d981495b4207d2c079dbbf690788858c17e145bb [log] [tgz]
author: Frank Barchard <fbarchard@google.com> Wed Jun 07 15:19:25 2017 -0700
committer: Commit Bot <commit-bot@chromium.org> Wed Jun 07 23:23:24 2017 +0000
tree: be43a03932fe1a9a7e60ad07715f09672312ad07
parent: 790e0634a8a974cdc4721e5de34a06dc4961f7fa [diff]
diff --git a/README.chromium b/README.chromium
index 27305ad..81b2541 100644
--- a/README.chromium
+++ b/README.chromium

@@ -1,6 +1,6 @@
 Name: libyuv
 URL: http://code.google.com/p/libyuv/
-Version: 1659
+Version: 1660
 License: BSD
 License File: LICENSE
 

diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index eeb8c89..3f7f757 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h

@@ -11,6 +11,6 @@
 #ifndef INCLUDE_LIBYUV_VERSION_H_
 #define INCLUDE_LIBYUV_VERSION_H_
 
-#define LIBYUV_VERSION 1659
+#define LIBYUV_VERSION 1660
 
 #endif  // INCLUDE_LIBYUV_VERSION_H_

diff --git a/source/compare_neon.cc b/source/compare_neon.cc
index 2d84c6c..3066d74 100644
--- a/source/compare_neon.cc
+++ b/source/compare_neon.cc

@@ -22,12 +22,12 @@
     !defined(__aarch64__)
 
 // 256 bits at a time
+// uses short accumulator which restricts count to 131 KB
 uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
   uint32 diff;
 
   asm volatile (
-    // Load constants.
-    "vmov.u8    q4, #0                         \n"  // accumulator
+    "vmov.u16   q4, #0                         \n"  // accumulator
 
   "1:                                          \n"
     "vld1.8     {q0, q1}, [%0]!                \n"
@@ -38,13 +38,12 @@
     "vcnt.i8    q1, q1                         \n"
     "subs       %2, %2, #32                    \n"
     "vadd.u8    q0, q0, q1                     \n"  // 16 byte counts
-    "vpaddl.u8  q0, q0                         \n"  // 8 shorts
-    "vpadal.u16 q4, q0                         \n"  // 4 ints
+    "vpadal.u8  q4, q0                         \n"  // 8 shorts
     "bgt        1b                             \n"
 
-    "vpadd.u32  d0, d8, d9                     \n"
+    "vpaddl.u16 q0, q4                         \n"  // 4 ints
+    "vpadd.u32  d0, d0, d1                     \n"
     "vpadd.u32  d0, d0, d0                     \n"
-    // Move distance to return register.
     "vmov.32    %3, d0[0]                      \n"
  
     : "+r"(src_a),

diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc
index 2c0b68b..8e1ff58 100644
--- a/source/compare_neon64.cc
+++ b/source/compare_neon64.cc

@@ -21,10 +21,11 @@
 #if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
 
 // 256 bits at a time
+// uses short accumulator which restricts count to 131 KB
 uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
   uint32 diff;
   asm volatile (
-    "movi       v4.4s, #0                      \n"
+    "movi       v4.8h, #0                      \n"
 
   "1:                                          \n"
     "ld1        {v0.16b, v1.16b}, [%0], #32    \n"
@@ -35,11 +36,10 @@
     "cnt        v1.16b, v1.16b                 \n"
     "subs       %w2, %w2, #32                  \n"
     "add        v0.16b, v0.16b, v1.16b         \n"
-    "uaddlp     v0.8h, v0.16b                  \n"
-    "uadalp     v4.4s, v0.8h                   \n"
+    "uadalp     v4.8h, v0.16b                  \n"
     "b.gt       1b                             \n"
 
-    "addv       s4, v4.4s                      \n"
+    "uaddlv     s4, v4.8h                      \n"
     "fmov       %w3, s4                        \n"
     : "+r"(src_a),
       "+r"(src_b),
commit	d981495b4207d2c079dbbf690788858c17e145bb	[log] [tgz]
author	Frank Barchard <fbarchard@google.com>	Wed Jun 07 15:19:25 2017 -0700
committer	Commit Bot <commit-bot@chromium.org>	Wed Jun 07 23:23:24 2017 +0000
tree	be43a03932fe1a9a7e60ad07715f09672312ad07
parent	790e0634a8a974cdc4721e5de34a06dc4961f7fa [diff]