[zlib] Adding PMULL based CRC-32 for aarch64

Using the polynomial multiplication instruction allows us to further
improve the decompression speed of gzipped content in zlib.

The average gain is modest (around +4.9%), but it is more pronounced
for some relevant content (e.g. +10% for HTMLx4 & JPEG, +6% for JS) on
more recent chip designs.

Since Chrome is distributed as a 64-bit (i.e. aarch64) binary only for
higher-end devices, this change should be safe from a performance
perspective.

For lower-spec devices, the same serial crc32 code using the crypto
extensions is used, so there is no change in behavior for 32-bit
builds.
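
Callers need no change: the existing crc32() entry point dispatches
internally (see the crc32.c hunk below). A minimal usage sketch, with a
buffer larger than Z_CRC32_PMULL_MINIMUM_LENGTH (128 bytes) so the new
PMULL path can be taken on aarch64:

    #include <stdio.h>
    #include "zlib.h"

    int main(void) {
        static const unsigned char buf[4096] = {0}; /* stand-in payload */
        unsigned long crc = crc32(0L, Z_NULL, 0);   /* initial CRC value */
        crc = crc32(crc, buf, (uInt)sizeof(buf));   /* PMULL path on aarch64 */
        printf("crc32: %08lx\n", crc);
        return 0;
    }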

Bug: 873725
Change-Id: I33b9b345b82b5256ec922324984f9a371949bbe6
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/2422723
Reviewed-by: Chris Blume <cblume@chromium.org>
Commit-Queue: Adenilson Cavalcanti <cavalcantii@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1006184}
NOKEYCHECK=True
GitOrigin-RevId: a0771caebe87477558454cc6d793562e3afe74ac
diff --git a/chromeconf.h b/chromeconf.h
index c365fa9..7c2241a 100644
--- a/chromeconf.h
+++ b/chromeconf.h
@@ -194,6 +194,7 @@
 #define arm_cpu_enable_pmull Cr_z_arm_cpu_enable_pmull
 #define arm_check_features Cr_z_arm_check_features
 #define armv8_crc32_little Cr_z_armv8_crc32_little
+#define armv8_crc32_pmull_little Cr_z_armv8_crc32_pmull_little
 
 /* Symbols added by cpu_features.c */
 #define cpu_check_features Cr_z_cpu_check_features
diff --git a/cpu_features.c b/cpu_features.c
index 70f01be..9391d7b 100644
--- a/cpu_features.c
+++ b/cpu_features.c
@@ -18,13 +18,16 @@
 /* TODO(cavalcantii): remove checks for x86_flags on deflate.
  */
 #if defined(ARMV8_OS_MACOS)
-/* crc32 is a baseline feature in ARMv8.1-A, and macOS running on arm64 is new
- * enough that this can be assumed without runtime detection. */
+/* Crypto extensions (crc32/pmull) are a baseline feature in ARMv8.1-A, and
+ * OSX running on arm64 is new enough that these can be assumed without
+ * runtime detection.
+ */
 int ZLIB_INTERNAL arm_cpu_enable_crc32 = 1;
+int ZLIB_INTERNAL arm_cpu_enable_pmull = 1;
 #else
 int ZLIB_INTERNAL arm_cpu_enable_crc32 = 0;
-#endif
 int ZLIB_INTERNAL arm_cpu_enable_pmull = 0;
+#endif
 int ZLIB_INTERNAL x86_cpu_enable_sse2 = 0;
 int ZLIB_INTERNAL x86_cpu_enable_ssse3 = 0;
 int ZLIB_INTERNAL x86_cpu_enable_simd = 0;
diff --git a/crc32.c b/crc32.c
index 142f1e8..e60c372 100644
--- a/crc32.c
+++ b/crc32.c
@@ -1101,23 +1101,29 @@
     const unsigned char FAR *buf;
     uInt len;
 {
-#if defined(CRC32_ARMV8_CRC32)
-    /* We got to verify ARM CPU features, so exploit the common usage pattern
-     * of calling this function with Z_NULL for an initial valid crc value.
-     * This allows to cache the result of the feature check and avoid extraneous
-     * function calls.
-     * TODO: try to move this to crc32_z if we don't loose performance on ARM.
+    /* Some bots compile with optimizations disabled, others will emulate
+     * ARM on x86 and other weird combinations.
      */
-    if (buf == Z_NULL) {
-        if (!len) /* Assume user is calling crc32(0, NULL, 0); */
-            cpu_check_features();
-        return 0UL;
-    }
+#if defined(CRC32_ARMV8_CRC32)
+    if (arm_cpu_enable_crc32) {
+#if defined(__aarch64__)
+        /* PMULL is 64-bit only, and needs a buffer of at least 64 bytes. */
+        if (arm_cpu_enable_pmull && (len > Z_CRC32_PMULL_MINIMUM_LENGTH)) {
+            const size_t chunk_size = len & ~Z_CRC32_PMULL_CHUNKSIZE_MASK;
+            crc = ~armv8_crc32_pmull_little(buf, chunk_size, ~(uint32_t)crc);
+            /* Check remaining data. */
+            len -= chunk_size;
+            if (!len)
+                return crc;
 
-    if (arm_cpu_enable_crc32)
-        return armv8_crc32_little(crc, buf, len);
+            /* Fall through for the remaining data. */
+            buf += chunk_size;
+        }
 #endif
-    return crc32_z(crc, buf, len);
+        return armv8_crc32_little(buf, len, crc); /* Armv8@32bit or tail. */
+    }
+#endif
+    return crc32_z(crc, buf, len); /* Armv7 or Armv8 w/o crypto extensions. */
 }
 
 /* ========================================================================= */
diff --git a/crc32_simd.c b/crc32_simd.c
index c8e5592..14a8534 100644
--- a/crc32_simd.c
+++ b/crc32_simd.c
@@ -157,8 +157,6 @@
 #elif defined(CRC32_ARMV8_CRC32)
 
 /* CRC32 checksums using ARMv8-a crypto instructions.
- *
- * TODO: implement a version using the PMULL instruction.
  */
 
 #if defined(__clang__)
@@ -178,13 +176,23 @@
  * feature for this target (ignoring feature)." This appears to be a harmless
  * bug in clang.
  */
+/* XXX: Cannot hook into builtins with Xcode for arm64. */
+#if !defined(ARMV8_OS_MACOS)
 #define __crc32b __builtin_arm_crc32b
 #define __crc32d __builtin_arm_crc32d
 #define __crc32w __builtin_arm_crc32w
 #define __crc32cw __builtin_arm_crc32cw
+#endif
+
+/* We need some extra types for using PMULL.
+ */
+#if defined(__aarch64__)
+#include <arm_neon.h>
+#include <arm_acle.h>
+#endif
 
 #if defined(__aarch64__)
-#define TARGET_ARMV8_WITH_CRC __attribute__((target("crc")))
+#define TARGET_ARMV8_WITH_CRC __attribute__((target("aes,crc")))
 #else  // !defined(__aarch64__)
 #define TARGET_ARMV8_WITH_CRC __attribute__((target("armv8-a,crc")))
 #endif  // defined(__aarch64__)
@@ -200,9 +208,10 @@
 #endif
 
 TARGET_ARMV8_WITH_CRC
-uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc,
-                                          const unsigned char *buf,
-                                          z_size_t len)
+uint32_t ZLIB_INTERNAL armv8_crc32_little(
+    const unsigned char *buf,
+    z_size_t len,
+    uint32_t crc)
 {
     uint32_t c = (uint32_t) ~crc;
 
@@ -240,4 +249,178 @@
     return ~c;
 }
 
+#if defined(__aarch64__) || defined(ARMV8_OS_MACOS) /* aarch64 specific code. */
+
+/*
+ * armv8_crc32_pmull_little(): compute the crc32 of the buffer, where the
+ * buffer length must be at least 64, and a multiple of 16. Based on:
+ *
+ * "Fast CRC Computation for Generic Polynomials Using PCLMULQDQ Instruction"
+ *  V. Gopal, E. Ozturk, et al., 2009, http://intel.ly/2ySEwL0
+ */
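+
+/*
+ * Overview: pmull_lo()/pmull_01()/pmull_hi() below wrap the PMULL/PMULL2
+ * instructions (64x64 -> 128 bit carry-less multiplication of the chosen
+ * doublewords). armv8_crc32_pmull_little() folds the input four 128-bit
+ * registers at a time, reduces them to a single 128-bit value, folds that
+ * to 64 bits and finishes with a Barrett reduction down to the 32-bit CRC.
+ */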
+TARGET_ARMV8_WITH_CRC
+static inline uint8x16_t pmull_lo(const uint64x2_t a, const uint64x2_t b)
+{
+    uint8x16_t r;
+    __asm__ __volatile__ ("pmull  %0.1q, %1.1d, %2.1d \n\t"
+        : "=w" (r) : "w" (a), "w" (b) );
+    return r;
+}
+
+TARGET_ARMV8_WITH_CRC
+static inline uint8x16_t pmull_01(const uint64x2_t a, const uint64x2_t b)
+{
+    uint8x16_t r;
+    __asm__ __volatile__ ("pmull  %0.1q, %1.1d, %2.1d \n\t"
+        : "=w" (r) : "w" (a), "w" (vgetq_lane_u64(b, 1)) );
+    return r;
+}
+
+TARGET_ARMV8_WITH_CRC
+static inline uint8x16_t pmull_hi(const uint64x2_t a, const uint64x2_t b)
+{
+    uint8x16_t r;
+    __asm__ __volatile__ ("pmull2 %0.1q, %1.2d, %2.2d \n\t"
+        : "=w" (r) : "w" (a), "w" (b) );
+    return r;
+}
+
+TARGET_ARMV8_WITH_CRC
+uint32_t ZLIB_INTERNAL armv8_crc32_pmull_little(
+    const unsigned char *buf,
+    z_size_t len,
+    uint32_t crc)
+{
+    /*
+     * Definitions of the bit-reflected domain constants k1,k2,k3, etc and
+     * the CRC32+Barrett polynomials given at the end of the paper.
+     */
+    static const uint64_t zalign(16) k1k2[] = { 0x0154442bd4, 0x01c6e41596 };
+    static const uint64_t zalign(16) k3k4[] = { 0x01751997d0, 0x00ccaa009e };
+    static const uint64_t zalign(16) k5k0[] = { 0x0163cd6124, 0x0000000000 };
+    static const uint64_t zalign(16) poly[] = { 0x01db710641, 0x01f7011641 };
+
+    uint64x2_t x0, x1, x2, x3, x4, x5, x6, x7, x8, y5, y6, y7, y8;
+
+    /*
+     * There's at least one block of 64.
+     */
+    x1 = vld1q_u64((const uint64_t *)(buf + 0x00));
+    x2 = vld1q_u64((const uint64_t *)(buf + 0x10));
+    x3 = vld1q_u64((const uint64_t *)(buf + 0x20));
+    x4 = vld1q_u64((const uint64_t *)(buf + 0x30));
+
+    x1 = veorq_u64(x1, (uint64x2_t) vsetq_lane_u32(crc, vdupq_n_u32(0), 0));
+
+    x0 = vld1q_u64(k1k2);
+
+    buf += 64;
+    len -= 64;
+
+    /*
+     * Parallel fold blocks of 64, if any.
+     */
+    while (len >= 64)
+    {
+        x5 = (uint64x2_t) pmull_lo(x1, x0);
+        x6 = (uint64x2_t) pmull_lo(x2, x0);
+        x7 = (uint64x2_t) pmull_lo(x3, x0);
+        x8 = (uint64x2_t) pmull_lo(x4, x0);
+
+        y5 = vld1q_u64((const uint64_t *)(buf + 0x00));
+        y6 = vld1q_u64((const uint64_t *)(buf + 0x10));
+        y7 = vld1q_u64((const uint64_t *)(buf + 0x20));
+        y8 = vld1q_u64((const uint64_t *)(buf + 0x30));
+
+        x1 = (uint64x2_t) pmull_hi(x1, x0);
+        x2 = (uint64x2_t) pmull_hi(x2, x0);
+        x3 = (uint64x2_t) pmull_hi(x3, x0);
+        x4 = (uint64x2_t) pmull_hi(x4, x0);
+
+        x1 = veorq_u64(x1, x5);
+        x2 = veorq_u64(x2, x6);
+        x3 = veorq_u64(x3, x7);
+        x4 = veorq_u64(x4, x8);
+
+        x1 = veorq_u64(x1, y5);
+        x2 = veorq_u64(x2, y6);
+        x3 = veorq_u64(x3, y7);
+        x4 = veorq_u64(x4, y8);
+
+        buf += 64;
+        len -= 64;
+    }
+
+    /*
+     * Fold into 128-bits.
+     */
+    x0 = vld1q_u64(k3k4);
+
+    x5 = (uint64x2_t) pmull_lo(x1, x0);
+    x1 = (uint64x2_t) pmull_hi(x1, x0);
+    x1 = veorq_u64(x1, x2);
+    x1 = veorq_u64(x1, x5);
+
+    x5 = (uint64x2_t) pmull_lo(x1, x0);
+    x1 = (uint64x2_t) pmull_hi(x1, x0);
+    x1 = veorq_u64(x1, x3);
+    x1 = veorq_u64(x1, x5);
+
+    x5 = (uint64x2_t) pmull_lo(x1, x0);
+    x1 = (uint64x2_t) pmull_hi(x1, x0);
+    x1 = veorq_u64(x1, x4);
+    x1 = veorq_u64(x1, x5);
+
+    /*
+     * Single fold blocks of 16, if any.
+     */
+    while (len >= 16)
+    {
+        x2 = vld1q_u64((const uint64_t *)buf);
+
+        x5 = (uint64x2_t) pmull_lo(x1, x0);
+        x1 = (uint64x2_t) pmull_hi(x1, x0);
+        x1 = veorq_u64(x1, x2);
+        x1 = veorq_u64(x1, x5);
+
+        buf += 16;
+        len -= 16;
+    }
+
+    /*
+     * Fold 128-bits to 64-bits.
+     */
+    static uint32_t zalign(16) mask[] = { ~0u, 0u, ~0u, 0u };
+
+    x2 = (uint64x2_t) pmull_01(x1, x0);
+    x1 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 8);
+    x3 = (uint64x2_t) vld1q_u32(mask);
+    x1 = veorq_u64(x1, x2);
+
+    x0 = vld1q_u64(k5k0);
+
+    x2 = (uint64x2_t) vextq_u8(vreinterpretq_u8_u64(x1), vdupq_n_u8(0), 4);
+    x1 = vandq_u64(x1, x3);
+    x1 = (uint64x2_t) pmull_lo(x1, x0);
+    x1 = veorq_u64(x1, x2);
+
+    /*
+     * Barret reduce to 32-bits.
+     */
+    x0 = vld1q_u64(poly);
+
+    x2 = vandq_u64(x1, x3);
+    x2 = (uint64x2_t) pmull_01(x2, x0);
+    x2 = vandq_u64(x2, x3);
+    x2 = (uint64x2_t) pmull_lo(x2, x0);
+    x1 = veorq_u64(x1, x2);
+
+    /*
+     * Return the crc32.
+     */
+    return vgetq_lane_u32(vreinterpretq_u32_u64(x1), 1);
+}
+#endif /* aarch64 specific code. */
+
 #endif
diff --git a/crc32_simd.h b/crc32_simd.h
index 68bc235..6985cbb 100644
--- a/crc32_simd.h
+++ b/crc32_simd.h
@@ -15,10 +15,9 @@
  * crc32_sse42_simd_(): compute the crc32 of the buffer, where the buffer
  * length must be at least 64, and a multiple of 16.
  */
-uint32_t ZLIB_INTERNAL crc32_sse42_simd_(
-    const unsigned char *buf,
-    z_size_t len,
-    uint32_t crc);
+uint32_t ZLIB_INTERNAL crc32_sse42_simd_(const unsigned char* buf,
+                                         z_size_t len,
+                                         uint32_t crc);
 
 /*
  * crc32_sse42_simd_ buffer size constraints: see the use in zlib/crc32.c
@@ -30,7 +29,23 @@
 /*
  * CRC32 checksums using ARMv8-a crypto instructions.
  */
-uint32_t ZLIB_INTERNAL armv8_crc32_little(unsigned long crc,
-                                          const unsigned char* buf,
-                                          z_size_t len);
+uint32_t ZLIB_INTERNAL armv8_crc32_little(const unsigned char* buf,
+                                          z_size_t len,
+                                          uint32_t crc);
 
+/* aarch64 specific code. */
+#if defined(__aarch64__)
+
+/* 128 is the sweet spot at the time of coding (late 2020). */
+#define Z_CRC32_PMULL_MINIMUM_LENGTH 128
+#define Z_CRC32_PMULL_CHUNKSIZE_MASK 15
+
+/*
+ * CRC32 checksums using ARMv8-a PMULL instructions, where the buffer
+ * length must be at least 64, and a multiple of 16.
+ */
+uint32_t ZLIB_INTERNAL armv8_crc32_pmull_little(const unsigned char* buf,
+                                                z_size_t len,
+                                                uint32_t crc);
+
+#endif