[zlib][riscv] Implement generic chunk_copy

Back in 2017, Simon Hosie implemented chunk_copy for Arm using NEON
instructions, which was later ported to x86-64 by Noel Gordon.

The basic idea is to perform wide loads and stores during data
decompression (i.e. load a single wide vector instead of a single byte).
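
As a rough sketch (not the actual chunkcopy.h code; the helper name and
CHUNK_SIZE constant are illustrative, and the real code also handles
overlapping LZ77 copies), the idea amounts to:

    #include <stdint.h>
    #include <string.h>

    #define CHUNK_SIZE 16 /* one 128-bit "vector" per iteration */

    /* Copy a run one chunk at a time; writing a little past `len` is fine
     * because inflate guarantees slack space at the end of the output. */
    static void chunk_copy_sketch(uint8_t *out, const uint8_t *from,
                                  size_t len) {
      while (len > 0) {
        memcpy(out, from, CHUNK_SIZE); /* wide load + wide store */
        out += CHUNK_SIZE;
        from += CHUNK_SIZE;
        len -= (len < CHUNK_SIZE) ? len : CHUNK_SIZE;
      }
    }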

The current chunk_copy can be easily ported to other architectures that use
fixed-length vectors/registers, but it does not scale as well to architectures
with variable vector lengths (e.g. Arm SVE or RISC-V RVV 1.0).

In any case, it is possible to implement a *generic* chunk_copy** that relies
on the compiler builtins memcpy/memset, and this patch introduces that
functionality in Chromium zlib.

One important detail is that chunk_copy was written *before* read64le (an
optimization suggested by Nigel Tao that requires unaligned loads), and
chunk_copy is a prerequisite for both read64le and the unconditional decoding
of literals (suggested by Dougall Johnson).
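
A minimal sketch of the kind of unaligned 64-bit refill that read64le-style
inflate loops perform (names here are hypothetical, not the actual
inffast_chunk.c code):

    #include <stdint.h>
    #include <string.h>

    /* Refill the bit accumulator with 8 input bytes at once; memcpy lets
     * the compiler emit a single unaligned 64-bit load when the target
     * allows it.  Assumes a little-endian target (byte-swap otherwise). */
    static inline uint64_t read64le_sketch(const unsigned char *in) {
      uint64_t hold;
      memcpy(&hold, in, sizeof(hold));
      return hold;
    }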

The penalty of unaligned loads in read64le can otherwise negate the benefits
of chunk_copy, which is why we rely on clang flags to enable code generation
for unaligned scalar accesses.

The current patch yielded an average gain of +9.5% on a K230 board, with higher
gains for some important content like HTML (+16%) and source code (+11.6%).

** Link:
https://github.com/cloudflare/zlib/commit/063def93f91a3f5e463646fb3fe6da5c8705f8e8

Bug: 329282661
Change-Id: Ia32a4a1fed16169a59cd39775fa68f4e675dac09
Reviewed-on: https://chromium-review.googlesource.com/c/chromium/src/+/5402331
Reviewed-by: Chris Blume <cblume@chromium.org>
Commit-Queue: Adenilson Cavalcanti <cavalcantii@chromium.org>
Cr-Commit-Position: refs/heads/main@{#1283414}
NOKEYCHECK=True
GitOrigin-RevId: cb959c56ec21abb0526f52b5f66a07fba7b6b145
diff --git a/CMakeLists.txt b/CMakeLists.txt
index c3f4247..5db4a6e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -79,9 +79,16 @@
     add_definitions(-DRISCV_RVV)
     add_definitions(-DDEFLATE_SLIDE_HASH_RVV)
     add_definitions(-DADLER32_SIMD_RVV)
-    #TODO(cavalcantii): add remaining flags as we port optimizations to RVV.
-    # Required by CPU features detection code.
-    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --target=riscv64-unknown-linux-gnu -march=rv64gcv")
+
+    # TODO(cavalcantii): add remaining flags as we port optimizations to RVV.
+    # chunk_copy is required for READ64 and unconditional decode of literals.
+    add_definitions(-DINFLATE_CHUNK_GENERIC)
+    add_definitions(-DINFLATE_CHUNK_READ_64LE)
+
+    # Tested with clang-17, unaligned loads are required by read64 & chunk_copy.
+    # TODO(cavalcantii): replace internal clang flags for -munaligned-access
+    # when we have a newer compiler available.
+    SET(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} --target=riscv64-unknown-linux-gnu -march=rv64gcv -Xclang -target-feature -Xclang +unaligned-scalar-mem")
   endif()
 
 endif()
@@ -192,9 +199,14 @@
 if (ENABLE_SIMD_OPTIMIZATIONS)
   if (CMAKE_SYSTEM_PROCESSOR STREQUAL "riscv64")
     message("RISCVV: Add optimizations.")
+    list(REMOVE_ITEM ZLIB_SRCS inflate.c)
     list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.h)
+    list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/chunkcopy.h)
     list(APPEND ZLIB_PRIVATE_HDRS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.h)
+
     list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/adler32_simd.c)
+    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inffast_chunk.c)
+    list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/contrib/optimizations/inflate.c)
     list(APPEND ZLIB_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/cpu_features.c)
   else()
     list(REMOVE_ITEM ZLIB_SRCS inflate.c)
diff --git a/contrib/optimizations/chunkcopy.h b/contrib/optimizations/chunkcopy.h
index f40546d..97efff3 100644
--- a/contrib/optimizations/chunkcopy.h
+++ b/contrib/optimizations/chunkcopy.h
@@ -21,8 +21,10 @@
 
 #if defined(__clang__) || defined(__GNUC__) || defined(__llvm__)
 #define Z_BUILTIN_MEMCPY __builtin_memcpy
+#define Z_BUILTIN_MEMSET __builtin_memset
 #else
 #define Z_BUILTIN_MEMCPY zmemcpy
+#define Z_BUILTIN_MEMSET zmemset
 #endif
 
 #if defined(INFLATE_CHUNK_SIMD_NEON)
@@ -31,6 +33,8 @@
 #elif defined(INFLATE_CHUNK_SIMD_SSE2)
 #include <emmintrin.h>
 typedef __m128i z_vec128i_t;
+#elif defined(INFLATE_CHUNK_GENERIC)
+typedef struct { uint8_t x[16]; } z_vec128i_t;
 #else
 #error chunkcopy.h inflate chunk SIMD is not defined for your build target
 #endif
@@ -265,6 +269,77 @@
 static inline void v_store_128(void* out, const z_vec128i_t vec) {
   _mm_storeu_si128((__m128i*)out, vec);
 }
+#elif defined(INFLATE_CHUNK_GENERIC)
+/*
+ * Default implementations for chunk-copy functions rely on memcpy() being
+ * inlined by the compiler for best performance.  This is most likely to work
+ * as expected when the length argument is constant (as is the case here) and
+ * the target supports unaligned loads and stores.  Since that's not always a
+ * safe assumption, this may need extra compiler arguments such as
+ * `-mno-strict-align` or `-munaligned-access`, or the availability of
+ * extensions like SIMD.
+ */
+
+/*
+ * v_load64_dup(): load *src as an unaligned 64-bit int and duplicate it in
+ * every 64-bit component of the 128-bit result (64-bit int splat).
+ */
+static inline z_vec128i_t v_load64_dup(const void* src) {
+  int64_t in;
+  Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
+  z_vec128i_t out;
+  for (int i = 0; i < sizeof(out); i += sizeof(in)) {
+    Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
+  }
+  return out;
+}
+
+/*
+ * v_load32_dup(): load *src as an unaligned 32-bit int and duplicate it in
+ * every 32-bit component of the 128-bit result (32-bit int splat).
+ */
+static inline z_vec128i_t v_load32_dup(const void* src) {
+  int32_t in;
+  Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
+  z_vec128i_t out;
+  for (int i = 0; i < sizeof(out); i += sizeof(in)) {
+    Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
+  }
+  return out;
+}
+
+/*
+ * v_load16_dup(): load *src as an unaligned 16-bit int and duplicate it in
+ * every 16-bit component of the 128-bit result (16-bit int splat).
+ */
+static inline z_vec128i_t v_load16_dup(const void* src) {
+  int16_t in;
+  Z_BUILTIN_MEMCPY(&in, src, sizeof(in));
+  z_vec128i_t out;
+  for (int i = 0; i < sizeof(out); i += sizeof(in)) {
+    Z_BUILTIN_MEMCPY((uint8_t*)&out + i, &in, sizeof(in));
+  }
+  return out;
+}
+
+/*
+ * v_load8_dup(): load the 8-bit int *src and duplicate it in every 8-bit
+ * component of the 128-bit result (8-bit int splat).
+ */
+static inline z_vec128i_t v_load8_dup(const void* src) {
+  int8_t in = *(const uint8_t*)src;
+  z_vec128i_t out;
+  Z_BUILTIN_MEMSET(&out, in, sizeof(out));
+  return out;
+}
+
+/*
+ * v_store_128(): store the 128-bit vec in a memory destination (that might
+ * not be 16-byte aligned) void* out.
+ */
+static inline void v_store_128(void* out, const z_vec128i_t vec) {
+  Z_BUILTIN_MEMCPY(out, &vec, sizeof(vec));
+}
 #endif
 
 /*