Hamming Distance using 16 bit accumulators
Accumulating the per-byte bit counts in 16 bit lanes restricts the maximum
length that a single call can process, but saves an instruction in the inner
loop. The outer loop can sum the values.
32 bit Neon
Now BenchmarkHammingDistance_Opt (78 ms)
Was BenchmarkHammingDistance_Opt (92 ms)
64 bit Neon
Now BenchmarkHammingDistance_Opt (85 ms)
Was BenchmarkHammingDistance_Opt (92 ms)
R=wangcheng@google.com
TBR=kjellander@chromium.org
BUG=libyuv:701
TEST=BenchmarkHammingDistance
Change-Id: Ie40f0eac2f3339c33b833b42af5d394b122066ae
Reviewed-on: https://chromium-review.googlesource.com/526932
Reviewed-by: Frank Barchard <fbarchard@google.com>
Reviewed-by: Cheng Wang <wangcheng@google.com>
Commit-Queue: Frank Barchard <fbarchard@google.com>
diff --git a/README.chromium b/README.chromium
index 27305ad..81b2541 100644
--- a/README.chromium
+++ b/README.chromium
@@ -1,6 +1,6 @@
Name: libyuv
URL: http://code.google.com/p/libyuv/
-Version: 1659
+Version: 1660
License: BSD
License File: LICENSE
diff --git a/include/libyuv/version.h b/include/libyuv/version.h
index eeb8c89..3f7f757 100644
--- a/include/libyuv/version.h
+++ b/include/libyuv/version.h
@@ -11,6 +11,6 @@
#ifndef INCLUDE_LIBYUV_VERSION_H_
#define INCLUDE_LIBYUV_VERSION_H_
-#define LIBYUV_VERSION 1659
+#define LIBYUV_VERSION 1660
#endif // INCLUDE_LIBYUV_VERSION_H_
diff --git a/source/compare_neon.cc b/source/compare_neon.cc
index 2d84c6c..3066d74 100644
--- a/source/compare_neon.cc
+++ b/source/compare_neon.cc
@@ -22,12 +22,12 @@
!defined(__aarch64__)
// 256 bits at a time
+// uses short accumulator which restricts count to 131 KB
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff;
asm volatile (
- // Load constants.
- "vmov.u8 q4, #0 \n" // accumulator
+ "vmov.u16 q4, #0 \n" // accumulator
"1: \n"
"vld1.8 {q0, q1}, [%0]! \n"
@@ -38,13 +38,12 @@
"vcnt.i8 q1, q1 \n"
"subs %2, %2, #32 \n"
"vadd.u8 q0, q0, q1 \n" // 16 byte counts
- "vpaddl.u8 q0, q0 \n" // 8 shorts
- "vpadal.u16 q4, q0 \n" // 4 ints
+ "vpadal.u8 q4, q0 \n" // 8 shorts
"bgt 1b \n"
- "vpadd.u32 d0, d8, d9 \n"
+ "vpaddl.u16 q0, q4 \n" // 4 ints
+ "vpadd.u32 d0, d0, d1 \n"
"vpadd.u32 d0, d0, d0 \n"
- // Move distance to return register.
"vmov.32 %3, d0[0] \n"
: "+r"(src_a),
diff --git a/source/compare_neon64.cc b/source/compare_neon64.cc
index 2c0b68b..8e1ff58 100644
--- a/source/compare_neon64.cc
+++ b/source/compare_neon64.cc
@@ -21,10 +21,11 @@
#if !defined(LIBYUV_DISABLE_NEON) && defined(__aarch64__)
// 256 bits at a time
+// uses short accumulator which restricts count to 131 KB
uint32 HammingDistance_NEON(const uint8* src_a, const uint8* src_b, int count) {
uint32 diff;
asm volatile (
- "movi v4.4s, #0 \n"
+ "movi v4.8h, #0 \n"
"1: \n"
"ld1 {v0.16b, v1.16b}, [%0], #32 \n"
@@ -35,11 +36,10 @@
"cnt v1.16b, v1.16b \n"
"subs %w2, %w2, #32 \n"
"add v0.16b, v0.16b, v1.16b \n"
- "uaddlp v0.8h, v0.16b \n"
- "uadalp v4.4s, v0.8h \n"
+ "uadalp v4.8h, v0.16b \n"
"b.gt 1b \n"
- "addv s4, v4.4s \n"
+ "uaddlv s4, v4.8h \n"
"fmov %w3, s4 \n"
: "+r"(src_a),
"+r"(src_b),