[zlib] Adding PMULL based CRC-32 for aarch64
Using the polynomial multiplication instruction allows to improve a bit
more the decompression speed of gzipped content in zlib.
The average gains are not massive (around +4.9%), but they are more
pronounced for some relevant content (e.g. +10% for HTMLx4 & JPEG, +6% for
JS) on more recent chip designs.
Since Chrome is distributed as a 64bit binary (i.e. aarch64) only for
higher-end devices, it should be safe from a performance perspective.
For lower-spec devices, the same serial crc32 code using the crypto
extensions is used, therefore there is no change in behavior for 32bit.
Bug: 873725
Change-Id: I33b9b345b82b5256ec922324984f9a371949bbe6
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2422723
Reviewed-by: Chris Blume <cblume@chromium.org>
Commit-Queue: Adenilson Cavalcanti <cavalcantii@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1006184}
NOKEYCHECK=True
GitOrigin-RevId: a0771caebe87477558454cc6d793562e3afe74ac
diff --git a/chromeconf.h b/chromeconf.h
index c365fa9..7c2241a 100644
--- a/chromeconf.h
+++ b/chromeconf.h
@@ -194,6 +194,7 @@
#define arm_cpu_enable_pmull Cr_z_arm_cpu_enable_pmull
#define arm_check_features Cr_z_arm_check_features
#define armv8_crc32_little Cr_z_armv8_crc32_little
+#define armv8_crc32_pmull_little Cr_z_armv8_crc32_pmull_little
/* Symbols added by cpu_features.c */
#define cpu_check_features Cr_z_cpu_check_features
diff --git a/cpu_features.c b/cpu_features.c
index 70f01be..9391d7b 100644
--- a/cpu_features.c
+++ b/cpu_features.c
@@ -18,13 +18,16 @@
/* TODO(cavalcantii): remove checks for x86_flags on deflate.
*/
#if defined(ARMV8_OS_MACOS)
-/* crc32 is a baseline feature in ARMv8.1-A, and macOS running on arm64 is new
- * enough that this can be assumed without runtime detection. */
+/* Crypto extensions (crc32/pmull) are a baseline feature in ARMv8.1-A, and
+ * macOS running on arm64 is new enough that these can be assumed without
+ * runtime detection.
+ */
int ZLIB_INTERNAL arm_cpu_enable_crc32 = 1;
+int ZLIB_INTERNAL arm_cpu_enable_pmull = 1;
#else
int ZLIB_INTERNAL arm_cpu_enable_crc32 = 0;
-#endif
int ZLIB_INTERNAL arm_cpu_enable_pmull = 0;
+#endif
int ZLIB_INTERNAL x86_cpu_enable_sse2 = 0;
int ZLIB_INTERNAL x86_cpu_enable_ssse3 = 0;
int ZLIB_INTERNAL x86_cpu_enable_simd = 0;
diff --git a/crc32.c b/crc32.c
index 142f1e8..e60c372 100644
--- a/crc32.c
+++ b/crc32.c
@@ -1101,23 +1101,29 @@
const unsigned char FAR *buf;
uInt len;
{
-#if defined(CRC32_ARMV8_CRC32)
- /* We got to verify ARM CPU features, so exploit the common usage pattern
- * of calling this function with Z_NULL for an initial valid crc value.
- * This allows to cache the result of the feature check and avoid extraneous
- * function calls.
- * TODO: try to move this to crc32_z if we don't loose performance on ARM.
+ /* Some bots compile with optimizations disabled, others will emulate
+ * ARM on x86 and other weird combinations.
*/
- if (buf == Z_NULL) {
- if (!len) /* Assume user is calling crc32(0, NULL, 0); */
- cpu_check_features();
- return 0UL;
- }
+#if defined(CRC32_ARMV8_CRC32)
+ if (arm_cpu_enable_crc32) {
+#if defined(__aarch64__)
+    /* PMULL is 64bit-only, plus the code needs at least a 64-byte buffer. */
+ if (arm_cpu_enable_pmull && (len > Z_CRC32_PMULL_MINIMUM_LENGTH)) {
+ const size_t chunk_size = len & ~Z_CRC32_PMULL_CHUNKSIZE_MASK;
+ crc = ~armv8_crc32_pmull_little(buf, chunk_size, ~(uint32_t)crc);
+ /* Check remaining data. */
+ len -= chunk_size;
+ if (!len)
+ return crc;
- if (arm_cpu_enable_crc32)
- return armv8_crc32_little(crc, buf, len);
+ /* Fall through for the remaining data. */
+ buf += chunk_size;
+ }
#endif
- return crc32_z(crc, buf, len);
+ return armv8_crc32_little(buf, len, crc); /* Armv8@32bit or tail. */
+ }
+#endif
+ return crc32_z(crc, buf, len); /* Armv7 or Armv8 w/o crypto extensions. */
}
/* ========================================================================= */
diff --git a/crc32_simd.c b/crc32_simd.c
index c8e5592..14a8534 100644
--- a/crc32_simd.c
+++ b/crc32_simd.c
@@ -157,8 +157,6 @@
#elif defined(CRC32_ARMV8_CRC32)
/* CRC32 checksums using ARMv8-a crypto instructions.
- *
- * TODO: implement a version using the PMULL instruction.
*/
#if defined(__clang__)
@@ -178,13 +176,23 @@
* feature for this target (ignoring feature)." This appears to be a harmless
* bug in clang.
*/
+/* XXX: Cannot hook into builtins with Xcode for arm64. */
+#if !defined(ARMV8_OS_MACOS)
#define __crc32b __builtin_arm_crc32b
#define __crc32d __builtin_arm_crc32d
#define __crc32w __builtin_arm_crc32w
#define __crc32cw __builtin_arm_crc32cw
+#endif
+
+/* We need some extra types for using PMULL.
+ */
+#if defined(__aarch64__)
+#include <arm_neon.h>
+#include <arm_acle.h>
+#endif
#if defined(__aarch64__)
-#define TARGET_ARMV8_WITH_CRC __attribute__((target("crc")))
+#define TARGET_ARMV8_WITH_CRC __attribute__((target("aes,crc")))
#else // !defined(__aarch64__)
#define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
#endif // defined(__aarch64__)
@@ -200,9 +208,10 @@
#endif
TARGET_ARMV8_WITH_CRC
-uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc,
- const unsigned char *buf,
- z_size_t len)
+uint32_t ZLIB_INTERNAL armv8_crc32_little(
+ const unsigned char *buf,
+ z_size_t len,
+ uint32_t crc)
{
uint32_t c = (uint32_t) ~crc;
@@ -240,4 +249,178 @@
return ~c;
}
+#if defined(__aarch64__) || defined(ARMV8_OS_MACOS) /* aarch64 specific code. */
+
+/*
+ * crc32_pmull_simd_(): compute the crc32 of the buffer, where the buffer
+ * length must be at least 64, and a multiple of 16. Based on:
+ *
+ * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+ * V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
+ */
+TARGET_ARMV8_WITH_CRC
+static inline uint8x16_t pmull_lo(const uint64x2_t a, const uint64x2_t b)
+{
+ uint8x16_t r;
+ __asm__ __volatile__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
+ : "=w" (r) : "w" (a), "w" (b) );
+ return r;
+}
+
+TARGET_ARMV8_WITH_CRC
+static inline uint8x16_t pmull_01(const uint64x2_t a, const uint64x2_t b)
+{
+ uint8x16_t r;
+ __asm__ __volatile__ ("pmull %0.1q, %1.1d, %2.1d \n\t"
+ : "=w" (r) : "w" (a), "w" (vgetq_lane_u64(b, 1)) );
+ return r;
+}
+
+TARGET_ARMV8_WITH_CRC
+static inline uint8x16_t pmull_hi(const uint64x2_t a, const uint64x2_t b)
+{
+ uint8x16_t r;
+ __asm__ __volatile__ ("pmull2 %0.1q, %1.2d, %2.2d \n\t"
+ : "=w" (r) : "w" (a), "w" (b) );
+ return r;
+}
+
+TARGET_ARMV8_WITH_CRC
+uint32_t ZLIB_INTERNAL armv8_crc32_pmull_little(
+ const unsigned char *buf,
+ z_size_t len,
+ uint32_t crc)
+{
+ /*
+ * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
+ * the CRC32+Barrett polynomials given at the end of the paper.
+ */
+ static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
+ static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
+ static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
+ static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
+
+ uint64x2_t x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+
+ /*
+ * There's at least one block of 64.
+ */
+ x1 = vld1q_u64((const uint64_t *)(buf + 0x00));
+ x2 = vld1q_u64((const uint64_t *)(buf + 0x10));
+ x3 = vld1q_u64((const uint64_t *)(buf + 0x20));
+ x4 = vld1q_u64((const uint64_t *)(buf + 0x30));
+
+ x1 = veorq_u64(x1, (uint64x2_t) vsetq_lane_u32(crc, vdupq_n_u32(0), 0));
+
+ x0 = vld1q_u64(k1k2);
+
+ buf += 64;
+ len -= 64;
+
+ /*
+ * Parallel fold blocks of 64, if any.
+ */
+ while (len >= 64)
+ {
+ x5 = (uint64x2_t) pmull_lo(x1, x0);
+ x6 = (uint64x2_t) pmull_lo(x2, x0);
+ x7 = (uint64x2_t) pmull_lo(x3, x0);
+ x8 = (uint64x2_t) pmull_lo(x4, x0);
+
+ y5 = vld1q_u64((const uint64_t *)(buf + 0x00));
+ y6 = vld1q_u64((const uint64_t *)(buf + 0x10));
+ y7 = vld1q_u64((const uint64_t *)(buf + 0x20));
+ y8 = vld1q_u64((const uint64_t *)(buf + 0x30));
+
+ x1 = (uint64x2_t) pmull_hi(x1, x0);
+ x2 = (uint64x2_t) pmull_hi(x2, x0);
+ x3 = (uint64x2_t) pmull_hi(x3, x0);
+ x4 = (uint64x2_t) pmull_hi(x4, x0);
+
+ x1 = veorq_u64(x1, x5);
+ x2 = veorq_u64(x2, x6);
+ x3 = veorq_u64(x3, x7);
+ x4 = veorq_u64(x4, x8);
+
+ x1 = veorq_u64(x1, y5);
+ x2 = veorq_u64(x2, y6);
+ x3 = veorq_u64(x3, y7);
+ x4 = veorq_u64(x4, y8);
+
+ buf += 64;
+ len -= 64;
+ }
+
+ /*
+ * Fold into 128-bits.
+ */
+ x0 = vld1q_u64(k3k4);
+
+ x5 = (uint64x2_t) pmull_lo(x1, x0);
+ x1 = (uint64x2_t) pmull_hi(x1, x0);
+ x1 = veorq_u64(x1, x2);
+ x1 = veorq_u64(x1, x5);
+
+ x5 = (uint64x2_t) pmull_lo(x1, x0);
+ x1 = (uint64x2_t) pmull_hi(x1, x0);
+ x1 = veorq_u64(x1, x3);
+ x1 = veorq_u64(x1, x5);
+
+ x5 = (uint64x2_t) pmull_lo(x1, x0);
+ x1 = (uint64x2_t) pmull_hi(x1, x0);
+ x1 = veorq_u64(x1, x4);
+ x1 = veorq_u64(x1, x5);
+
+ /*
+ * Single fold blocks of 16, if any.
+ */
+ while (len >= 16)
+ {
+ x2 = vld1q_u64((const uint64_t *)buf);
+
+ x5 = (uint64x2_t) pmull_lo(x1, x0);
+ x1 = (uint64x2_t) pmull_hi(x1, x0);
+ x1 = veorq_u64(x1, x2);
+ x1 = veorq_u64(x1, x5);
+
+ buf += 16;
+ len -= 16;
+ }
+
+ /*
+ * Fold 128-bits to 64-bits.
+ */
+ static uint32_t zalign(16) mask[] = { ~0u, 0u, ~0u, 0u };
+
+ x2 = (uint64x2_t) pmull_01(x1, x0);
+ x1 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 8);
+ x3 = (uint64x2_t) vld1q_u32(mask);
+ x1 = veorq_u64(x1, x2);
+
+ x0 = vld1q_u64(k5k0);
+
+ x2 = (uint64x2_t) pmull_01(x2, x0);
+ x2 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 4);
+ x1 = vandq_u64(x1, x3);
+ x1 = (uint64x2_t) pmull_lo(x1, x0);
+ x1 = veorq_u64(x1, x2);
+
+ /*
+ * Barret reduce to 32-bits.
+ */
+ x0 = vld1q_u64(poly);
+
+ x2 = vandq_u64(x1, x3);
+ x2 = (uint64x2_t) pmull_01(x2, x0);
+ x2 = vandq_u64(x2, x3);
+ x2 = (uint64x2_t) pmull_lo(x2, x0);
+ x1 = veorq_u64(x1, x2);
+
+ /*
+ * Return the crc32.
+ */
+ return vgetq_lane_u32(vreinterpretq_u32_u64(x1), 1);
+}
+#endif /* aarch64 specific code. */
+
#endif
diff --git a/crc32_simd.h b/crc32_simd.h
index 68bc235..6985cbb 100644
--- a/crc32_simd.h
+++ b/crc32_simd.h
@@ -15,10 +15,9 @@
* crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
* length must be at least 64, and a multiple of 16.
*/
-uint32_t ZLIB_INTERNAL crc32_sse42_simd_(
- const unsigned char *buf,
- z_size_t len,
- uint32_t crc);
+uint32_t ZLIB_INTERNAL crc32_sse42_simd_(const unsigned char* buf,
+ z_size_t len,
+ uint32_t crc);
/*
* crc32_sse42_simd_ buffer size constraints: see the use in zlib/crc32.c
@@ -30,7 +29,23 @@
/*
* CRC32 checksums using ARMv8-a crypto instructions.
*/
-uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc,
- const unsigned char* buf,
- z_size_t len);
+uint32_t ZLIB_INTERNAL armv8_crc32_little(const unsigned char* buf,
+ z_size_t len,
+ uint32_t crc);
+/* aarch64 specific code. */
+#if defined(__aarch64__)
+
+/* 128 is the sweet spot at the time of coding (late 2020). */
+#define Z_CRC32_PMULL_MINIMUM_LENGTH 128
+#define Z_CRC32_PMULL_CHUNKSIZE_MASK 15
+
+/*
+ * CRC32 checksums using ARMv8-a PMULL instructions, where the buffer
+ * length must be at least 64, and a multiple of 16.
+ */
+uint32_t ZLIB_INTERNAL armv8_crc32_pmull_little(const unsigned char* buf,
+ z_size_t len,
+ uint32_t crc);
+
+#endif